-
-
Notifications
You must be signed in to change notification settings - Fork 11.9k
ENH add hash based unique #26018
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH add hash based unique #26018
Changes from 1 commit
2933930
8da2e72
961ef5b
a6b1847
1f1c36c
0bc43c3
f94cf89
a37b151
8f42b0b
f56634f
bce7534
0c6c588
9b7d6f6
85cf692
3db3349
8d4b6be
9e7d671
a4e8a29
6a8c69c
0c3b889
cc39a50
92adb26
8b7ad2e
8f95240
1d0c596
ed4ea89
8adbf70
a8e69ff
cdf3af9
fc1d50e
5dbdf48
c8b9d22
a9df742
5333e80
3bb7c97
95a577b
724b794
1abc6b5
214cd06
8c184ab
113e021
c733f75
ae0e936
b50e7f3
712a5cf
8a45f04
4999daa
ab86574
08d7d62
e1e2ddf
f96411a
2319947
e188bf3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,9 @@ | ||
| #define NPY_NO_DEPRECATED_API NPY_API_VERSION | ||
|
|
||
| #include <ctime> | ||
| #include <unordered_map> | ||
| #include <map> | ||
| #include <vector> | ||
| #include <random> | ||
| #include <iostream> | ||
| #include <string> | ||
|
|
||
| #define _MULTIARRAYMODULE | ||
| #include "numpy/ndarraytypes.h" | ||
|
|
@@ -15,99 +13,15 @@ | |
|
|
||
| #include "numpy/npy_2_compat.h" | ||
|
|
||
|
|
||
/*
 * Allocate an array of `size` elements and fill it with pseudo-random
 * integers drawn uniformly from the closed range [0, max], converted to T.
 *
 * size - number of elements to generate.
 * max  - inclusive upper bound of the generated values.
 * type - unused value; it exists only so that T can be deduced at the
 *        call site.
 *
 * Returns a buffer obtained with `new T[size]`. The caller owns it and
 * must release it with `delete[]`.
 */
template <typename T>
T *random_data(std::size_t size, std::size_t max, T type)
{
    static_cast<void>(type);

    std::random_device dev;
    std::mt19937 rng(dev());
    // Draw in std::size_t so that `max` is never narrowed to the engine's
    // result type, whose width is platform dependent.
    std::uniform_int_distribution<std::size_t> rnd(0, max);

    T *res = new T[size];
    for (std::size_t i = 0; i < size; ++i)
    {
        res[i] = static_cast<T>(rnd(rng));
    }
    return res;
}
|
|
||
/*
 * Parse `text` as a non-negative decimal integer into `out`.
 *
 * Returns false (leaving `out` untouched) when `text` is null, empty,
 * contains anything other than the digits 0-9, or does not fit in
 * std::size_t.
 */
static bool parse_size_arg(const char *text, std::size_t &out)
{
    if (text == nullptr || *text == '\0')
    {
        return false;
    }

    const std::size_t limit = static_cast<std::size_t>(-1);
    std::size_t value = 0;
    for (const char *p = text; *p != '\0'; ++p)
    {
        if (*p < '0' || *p > '9')
        {
            return false;
        }
        const std::size_t digit = static_cast<std::size_t>(*p - '0');
        // Reject before multiplying so the accumulator can never wrap.
        if (value > (limit - digit) / 10)
        {
            return false;
        }
        value = value * 10 + digit;
    }

    out = value;
    return true;
}

/*
 * Read the benchmark command line: <program> {hash,rbt} <size> <max>.
 *
 * On success `alg` receives argv[1] verbatim and `size` / `max` receive
 * the parsed values of argv[2] / argv[3]. The algorithm name is not
 * validated here.
 *
 * If the argument count is wrong, or <size> / <max> is not a non-negative
 * decimal integer that fits in std::size_t, the usage line is written to
 * stderr and the process exits with status 1.
 */
void process_args(int argc, char *argv[], std::string &alg, std::size_t &size, std::size_t &max)
{
    // The argc test comes first so argv[2] / argv[3] are only read when present.
    if (argc != 4 || !parse_size_arg(argv[2], size) || !parse_size_arg(argv[3], max))
    {
        const char *program = (argc > 0 && argv[0] != nullptr) ? argv[0] : "<program>";
        std::cerr << "Usage: " << program << " {hash,rbt} <size> <max>" << std::endl;
        std::exit(1);
    }
    alg = argv[1];
}
|
|
||
/*
 * Insert every element of data[0 .. size) as a key of `container` (a
 * map-like type with operator[] and pair-valued iteration), then return
 * all keys of the container in the container's own iteration order.
 *
 * Keys already present in `container` before the call are part of the
 * result. `data` is only read, so read-only buffers are accepted.
 */
template <typename ContainerType, typename DataType>
std::vector<DataType> _unique(ContainerType &container, const DataType *data, std::size_t size)
{
    for (std::size_t i = 0; i < size; ++i)
    {
        container[data[i]] = 0;
    }

    std::vector<DataType> res;
    res.reserve(container.size());
    for (const auto &entry : container)
    {
        res.push_back(entry.first);
    }
    return res;
}

/*
 * Return the distinct values of data[0 .. size) using the algorithm
 * selected by `alg`:
 *   "hash" - std::unordered_map; result order is unspecified.
 *   "rbt"  - std::map; result is sorted ascending by operator<.
 *
 * Any other name writes "Unknown algorithm: <alg>" to stderr and exits
 * the process with status 1.
 */
template <typename T>
std::vector<T> unique(const std::string &alg, const T *data, std::size_t size)
{
    if (alg == "hash")
    {
        std::unordered_map<T, char> umap;
        return _unique(umap, data, size);
    }
    if (alg == "rbt")
    {
        std::map<T, char> map;
        return _unique(map, data, size);
    }

    std::cerr << "Unknown algorithm: " << alg << std::endl;
    std::exit(1);
}
|
|
||
| NPY_NO_EXPORT npy_intp | ||
| PyArray_Unique(PyArrayObject *self) | ||
| template<typename T> | ||
| npy_intp unique(PyArrayObject *self) | ||
| { | ||
| /* Nonzero boolean function */ | ||
| // PyArray_NonzeroFunc* nonzero = PyDataType_GetArrFuncs(PyArray_DESCR(self))->nonzero; | ||
|
|
||
| NpyIter* iter; | ||
| NpyIter_IterNextFunc *iternext; | ||
| char** dataptr; | ||
| npy_intp nonzero_count; | ||
| npy_intp* strideptr,* innersizeptr; | ||
| std::unordered_map<T, char> hashmap; | ||
|
|
||
| /* Handle zero-sized arrays specially */ | ||
| if (PyArray_SIZE(self) == 0) { | ||
| return 0; | ||
| } | ||
|
|
||
| /* | ||
| * Create and use an iterator to count the nonzeros. | ||
| * flag NPY_ITER_READONLY | ||
| * - The array is never written to. | ||
| * flag NPY_ITER_EXTERNAL_LOOP | ||
| * - Inner loop is done outside the iterator for efficiency. | ||
| * flag NPY_ITER_NPY_ITER_REFS_OK | ||
| * - Reference types are acceptable. | ||
| * order NPY_KEEPORDER | ||
| * - Visit elements in memory order, regardless of strides. | ||
| * This is good for performance when the specific order | ||
| * elements are visited is unimportant. | ||
| * casting NPY_NO_CASTING | ||
| * - No casting is required for this operation. | ||
| */ | ||
| iter = NpyIter_New(self, NPY_ITER_READONLY| | ||
| NPY_ITER_EXTERNAL_LOOP| | ||
adrinjalali marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| NPY_ITER_REFS_OK, | ||
|
|
@@ -133,38 +47,51 @@ PyArray_Unique(PyArrayObject *self) | |
| /* The location of the inner loop size which the iterator may update */ | ||
| innersizeptr = NpyIter_GetInnerLoopSizePtr(iter); | ||
|
|
||
| sum = 0; | ||
| std::cout << "printing values: " << std::endl; | ||
| do { | ||
| /* Get the inner loop data/stride/count values */ | ||
| char* data = *dataptr; | ||
| npy_intp stride = *strideptr; | ||
| npy_intp count = *innersizeptr; | ||
| npy_intp size = PyArray_ITEMSIZE(self); | ||
| /* This is a typical inner loop for NPY_ITER_EXTERNAL_LOOP */ | ||
|
|
||
| while (count--) { | ||
| if (nonzero(data, self)) { | ||
| ++nonzero_count; | ||
| } | ||
| std::cout << (T)* data << std::endl; | ||
| hashmap[(T)* data] = 0; | ||
| data += stride; | ||
| } | ||
|
|
||
| /* Increment the iterator to the next inner loop */ | ||
| } while(iternext(iter)); | ||
|
|
||
| NpyIter_Deallocate(iter); | ||
| std::vector<T> res; | ||
| std::cout << "unique values :" << std::endl; | ||
| res.reserve(hashmap.size()); | ||
| for (auto it = hashmap.begin(); it != hashmap.end(); it++) { | ||
| res.emplace_back(it->first); | ||
| std::cout << it->first << std::endl; | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we copy it over manually anyway (and don't already build the result, which makes sense), then we should allocate the array first. That also ensures we use the custom allocator a user may have set.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure if I understand, isn't the […]
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The point is that you are using […]. That works, but it is really the wrong way around: we should create the (empty) array with […]. After you move the order and use […] |
||
|
|
||
| return nonzero_count; | ||
| NpyIter_Deallocate(iter); | ||
| return 0; | ||
| } | ||
|
|
||
| int main(int argc, char *argv[]) | ||
| NPY_NO_EXPORT npy_intp | ||
| PyArray_Unique(PyArrayObject *self) | ||
| { | ||
| std::size_t size, max; | ||
| std::string alg; | ||
| process_args(argc, argv, alg, size, max); | ||
| double sample = 0; | ||
| double *data = random_data(size, max, sample); | ||
| const clock_t begin_time = clock(); | ||
| std::vector<double> unique_values = unique(alg, data, size); | ||
| std::cout << float( clock () - begin_time ) / CLOCKS_PER_SEC; | ||
| delete data; | ||
| npy_intp itemsize; | ||
|
|
||
| /* Handle zero-sized arrays specially */ | ||
| if (PyArray_SIZE(self) == 0) { | ||
| return 0; | ||
| } | ||
|
|
||
| itemsize = PyArray_ITEMSIZE(self); | ||
| std::cout << "Item size: " << itemsize << std::endl; | ||
|
|
||
| if (sizeof(char) == itemsize) { | ||
| unique<char>(self); | ||
| } else if (sizeof(int) == itemsize) { | ||
| unique<int>(self); | ||
| } | ||
| return 0; | ||
| } | ||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just curious, would an unordered_set not be enough?