diff --git a/doc/release/upcoming_changes/28767.change.rst b/doc/release/upcoming_changes/28767.change.rst new file mode 100644 index 000000000000..ec173c3672b0 --- /dev/null +++ b/doc/release/upcoming_changes/28767.change.rst @@ -0,0 +1,10 @@ +``unique_values`` for string dtypes may return unsorted data +------------------------------------------------------------ +np.unique now supports hash-based duplicate removal for string dtypes. +This enhancement extends the hash-table algorithm to byte strings ('S'), +Unicode strings ('U'), and the experimental string dtype ('T', StringDType). +As a result, calling np.unique() on an array of strings will use +the faster hash-based method to obtain unique values. +Note that this hash-based method does not guarantee that the returned unique values will be sorted. +This also works for StringDType arrays containing None (missing values) +when using equal_nan=True (treating missing values as equal). diff --git a/doc/release/upcoming_changes/28767.performance.rst b/doc/release/upcoming_changes/28767.performance.rst new file mode 100644 index 000000000000..ef8ac1c3a45d --- /dev/null +++ b/doc/release/upcoming_changes/28767.performance.rst @@ -0,0 +1,10 @@ +Performance improvements to ``np.unique`` for string dtypes +----------------------------------------------------------- +The hash-based algorithm for unique extraction provides +an order-of-magnitude speedup on large string arrays. +In an internal benchmark with about 1 billion string elements, +the hash-based np.unique completed in roughly 33.5 seconds, +compared to 498 seconds with the sort-based method +– about 15× faster for unsorted unique operations on strings. +This improvement greatly reduces the time to find unique values +in very large string datasets. 
diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index a4d2050122c6..4f004dc3ea1f 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -1206,6 +1206,7 @@ src_multiarray = multiarray_gen_headers + [ # Remove this `arm64_exports.c` file once scipy macos arm64 build correctly # links to the arm64 npymath library, see gh-22673 'src/npymath/arm64_exports.c', + 'src/multiarray/fnv.c', ] src_umath = umath_gen_headers + [ diff --git a/numpy/_core/src/multiarray/fnv.c b/numpy/_core/src/multiarray/fnv.c new file mode 100644 index 000000000000..2b7848519e61 --- /dev/null +++ b/numpy/_core/src/multiarray/fnv.c @@ -0,0 +1,85 @@ +/* + FNV-1a hash algorithm implementation + Based on the implementation from: + https://github.com/lcn2/fnv +*/ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + +#include +#include "numpy/npy_common.h" +#include "fnv.h" + + +#define FNV1A_32_INIT ((npy_uint32)0x811c9dc5) +#define FNV1A_64_INIT ((npy_uint64)0xcbf29ce484222325ULL) + +/* + Compute a 32-bit FNV-1a hash of buffer + original implementation from: + https://github.com/lcn2/fnv/blob/b7fcbee95538ee6a15744e756e7e7f1c02862cb0/hash_32a.c +*/ +npy_uint32 +npy_fnv1a_32(const void *buf, size_t len, npy_uint32 hval) +{ + const unsigned char *bp = (const unsigned char *)buf; /* start of buffer */ + const unsigned char *be = bp + len; /* beyond end of buffer */ + + /* + FNV-1a hash each octet in the buffer + */ + while (bp < be) { + + /* xor the bottom with the current octet */ + hval ^= (npy_uint32)*bp++; + + /* multiply by the 32 bit FNV magic prime */ + /* hval *= 0x01000193; */ + hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24); + } + + return hval; +} + +/* + Compute a 64-bit FNV-1a hash of the given data + original implementation from: + https://github.com/lcn2/fnv/blob/b7fcbee95538ee6a15744e756e7e7f1c02862cb0/hash_64a.c +*/ +npy_uint64 +npy_fnv1a_64(const void *buf, size_t len, npy_uint64 hval) +{ + const unsigned char 
*bp = (const unsigned char *)buf; /* start of buffer */ + const unsigned char *be = bp + len; /* beyond end of buffer */ + + /* + FNV-1a hash each octet in the buffer + */ + while (bp < be) { + + /* xor the bottom with the current octet */ + hval ^= (npy_uint64)*bp++; + + /* multiply by the 64 bit FNV magic prime */ + /* hval *= 0x100000001b3ULL; */ + hval += (hval << 1) + (hval << 4) + (hval << 5) + + (hval << 7) + (hval << 8) + (hval << 40); + } + + return hval; +} + +/* + * Compute a size_t FNV-1a hash of the given data + * This will use 32-bit or 64-bit hash depending on the size of size_t + */ +size_t +npy_fnv1a(const void *buf, size_t len) +{ +#if NPY_SIZEOF_SIZE_T == 8 + return (size_t)npy_fnv1a_64(buf, len, FNV1A_64_INIT); +#else /* NPY_SIZEOF_SIZE_T == 4 */ + return (size_t)npy_fnv1a_32(buf, len, FNV1A_32_INIT); +#endif +} diff --git a/numpy/_core/src/multiarray/fnv.h b/numpy/_core/src/multiarray/fnv.h new file mode 100644 index 000000000000..c76f54a645b9 --- /dev/null +++ b/numpy/_core/src/multiarray/fnv.h @@ -0,0 +1,26 @@ +/* + FNV-1a hash algorithm implementation + Based on the implementation from: + https://github.com/lcn2/fnv +*/ + +#ifndef NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_FNV_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_FNV_H_ + + +/* + Compute a size_t FNV-1a hash of the given data + This will use 32-bit or 64-bit hash depending on the size of size_t + + Parameters: + ----------- + buf - pointer to the data to be hashed + len - length of the data in bytes + + Returns: + ----------- + size_t hash value +*/ +size_t npy_fnv1a(const void *buf, size_t len); + +#endif // NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_FNV_H_ diff --git a/numpy/_core/src/multiarray/multiarraymodule.c b/numpy/_core/src/multiarray/multiarraymodule.c index 7724756ba351..955dca01e75d 100644 --- a/numpy/_core/src/multiarray/multiarraymodule.c +++ b/numpy/_core/src/multiarray/multiarraymodule.c @@ -4571,7 +4571,7 @@ static struct PyMethodDef array_module_methods[] = { {"from_dlpack", 
(PyCFunction)from_dlpack, METH_FASTCALL | METH_KEYWORDS, NULL}, {"_unique_hash", (PyCFunction)array__unique_hash, - METH_O, "Collect unique values via a hash map."}, + METH_FASTCALL | METH_KEYWORDS, "Collect unique values via a hash map."}, {NULL, NULL, 0, NULL} /* sentinel */ }; diff --git a/numpy/_core/src/multiarray/unique.cpp b/numpy/_core/src/multiarray/unique.cpp index f36acfdef49a..636f1ef0137c 100644 --- a/numpy/_core/src/multiarray/unique.cpp +++ b/numpy/_core/src/multiarray/unique.cpp @@ -1,13 +1,21 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define HASH_TABLE_INITIAL_BUCKETS 1024 #include -#include +#include +#include #include +#include #include #include "numpy/arrayobject.h" +#include "gil_utils.h" +extern "C" { + #include "fnv.h" + #include "npy_argparse.h" +} // This is to use RAII pattern to handle cpp exceptions while avoiding memory leaks. // Adapted from https://stackoverflow.com/a/25510879/2536294 @@ -18,77 +26,128 @@ struct FinalAction { private: F clean_; }; - template FinalAction finally(F f) { return FinalAction(f); } -template +template static PyObject* -unique(PyArrayObject *self) +unique_integer(PyArrayObject *self, npy_bool equal_nan) { - /* This function takes a numpy array and returns a numpy array containing - the unique values. - - It assumes the numpy array includes data that can be viewed as unsigned integers - of a certain size (sizeof(T)). - - It doesn't need to know the actual type, since it needs to find unique values - among binary representations of the input data. This means it won't apply to - custom or complicated dtypes or string values. + /* + * Returns a new NumPy array containing the unique values of the input array of integer. + * This function uses hashing to identify uniqueness efficiently. 
*/ NPY_ALLOW_C_API_DEF; - std::unordered_set hashset; - - NpyIter *iter = NpyIter_New(self, NPY_ITER_READONLY | - NPY_ITER_EXTERNAL_LOOP | - NPY_ITER_REFS_OK | - NPY_ITER_ZEROSIZE_OK | - NPY_ITER_GROWINNER, - NPY_KEEPORDER, NPY_NO_CASTING, - NULL); - // Making sure the iterator is deallocated when the function returns, with - // or w/o an exception - auto iter_dealloc = finally([&]() { NpyIter_Deallocate(iter); }); - if (iter == NULL) { - return NULL; + NPY_ALLOW_C_API; + PyArray_Descr *descr = PyArray_DESCR(self); + Py_INCREF(descr); + NPY_DISABLE_C_API; + + PyThreadState *_save1 = PyEval_SaveThread(); + + // number of elements in the input array + npy_intp isize = PyArray_SIZE(self); + + // Reserve hashset capacity in advance to minimize reallocations and collisions. + // We use min(isize, HASH_TABLE_INITIAL_BUCKETS) as the initial bucket count: + // - Reserving for all elements (isize) may over-allocate when there are few unique values. + // - Using a moderate upper bound HASH_TABLE_INITIAL_BUCKETS(1024) keeps memory usage reasonable (4 KiB for pointers). + // See discussion: https://github.com/numpy/numpy/pull/28767#discussion_r2064267631 + std::unordered_set hashset(std::min(isize, (npy_intp)HASH_TABLE_INITIAL_BUCKETS)); + + // Input array is one-dimensional, enabling efficient iteration using strides. + char *idata = PyArray_BYTES(self); + npy_intp istride = PyArray_STRIDES(self)[0]; + for (npy_intp i = 0; i < isize; i++, idata += istride) { + hashset.insert(*(T *)idata); } - NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - if (iternext == NULL) { + npy_intp length = hashset.size(); + + PyEval_RestoreThread(_save1); + NPY_ALLOW_C_API; + PyObject *res_obj = PyArray_NewFromDescr( + &PyArray_Type, + descr, + 1, // ndim + &length, // shape + NULL, // strides + NULL, // data + // This flag is needed to be able to call .sort on it. 
+ NPY_ARRAY_WRITEABLE, // flags + NULL // obj + ); + + if (res_obj == NULL) { return NULL; } - char **dataptr = NpyIter_GetDataPtrArray(iter); - npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); - npy_intp *innersizeptr = NpyIter_GetInnerLoopSizePtr(iter); - - // release the GIL - PyThreadState *_save; - _save = PyEval_SaveThread(); - // Making sure the GIL is re-acquired when the function returns, with - // or w/o an exception - auto grab_gil = finally([&]() { PyEval_RestoreThread(_save); }); - // first we put the data in a hash map - - if (NpyIter_GetIterSize(iter) > 0) { - do { - char* data = *dataptr; - npy_intp stride = *strideptr; - npy_intp count = *innersizeptr; - - while (count--) { - hashset.insert(*((T *) data)); - data += stride; - } - } while (iternext(iter)); + NPY_DISABLE_C_API; + PyThreadState *_save2 = PyEval_SaveThread(); + auto save2_dealloc = finally([&]() { + PyEval_RestoreThread(_save2); + }); + + char *odata = PyArray_BYTES((PyArrayObject *)res_obj); + npy_intp ostride = PyArray_STRIDES((PyArrayObject *)res_obj)[0]; + // Output array is one-dimensional, enabling efficient iteration using strides. + for (auto it = hashset.begin(); it != hashset.end(); it++, odata += ostride) { + *(T *)odata = *it; } - npy_intp length = hashset.size(); + return res_obj; +} +template +static PyObject* +unique_string(PyArrayObject *self, npy_bool equal_nan) +{ + /* + * Returns a new NumPy array containing the unique values of the input array of fixed size strings. + * This function uses hashing to identify uniqueness efficiently. 
+ */ + NPY_ALLOW_C_API_DEF; NPY_ALLOW_C_API; PyArray_Descr *descr = PyArray_DESCR(self); Py_INCREF(descr); + NPY_DISABLE_C_API; + + PyThreadState *_save1 = PyEval_SaveThread(); + + // number of elements in the input array + npy_intp isize = PyArray_SIZE(self); + + // variables for the string + npy_intp itemsize = descr->elsize; + npy_intp num_chars = itemsize / sizeof(T); + auto hash = [num_chars](const T *value) -> size_t { + return npy_fnv1a(value, num_chars * sizeof(T)); + }; + auto equal = [itemsize](const T *lhs, const T *rhs) -> bool { + return std::memcmp(lhs, rhs, itemsize) == 0; + }; + + // Reserve hashset capacity in advance to minimize reallocations and collisions. + // We use min(isize, HASH_TABLE_INITIAL_BUCKETS) as the initial bucket count: + // - Reserving for all elements (isize) may over-allocate when there are few unique values. + // - Using a moderate upper bound HASH_TABLE_INITIAL_BUCKETS(1024) keeps memory usage reasonable (4 KiB for pointers). + // See discussion: https://github.com/numpy/numpy/pull/28767#discussion_r2064267631 + std::unordered_set hashset( + std::min(isize, (npy_intp)HASH_TABLE_INITIAL_BUCKETS), hash, equal + ); + + // Input array is one-dimensional, enabling efficient iteration using strides. 
+ char *idata = PyArray_BYTES(self); + npy_intp istride = PyArray_STRIDES(self)[0]; + for (npy_intp i = 0; i < isize; i++, idata += istride) { + hashset.insert((T *)idata); + } + + npy_intp length = hashset.size(); + + PyEval_RestoreThread(_save1); + NPY_ALLOW_C_API; PyObject *res_obj = PyArray_NewFromDescr( &PyArray_Type, descr, @@ -100,18 +159,147 @@ unique(PyArrayObject *self) NPY_ARRAY_WRITEABLE, // flags NULL // obj ); + + if (res_obj == NULL) { + return NULL; + } NPY_DISABLE_C_API; + PyThreadState *_save2 = PyEval_SaveThread(); + auto save2_dealloc = finally([&]() { + PyEval_RestoreThread(_save2); + }); + + char *odata = PyArray_BYTES((PyArrayObject *)res_obj); + npy_intp ostride = PyArray_STRIDES((PyArrayObject *)res_obj)[0]; + // Output array is one-dimensional, enabling efficient iteration using strides. + for (auto it = hashset.begin(); it != hashset.end(); it++, odata += ostride) { + std::memcpy(odata, *it, itemsize); + } + + return res_obj; +} + +static PyObject* +unique_vstring(PyArrayObject *self, npy_bool equal_nan) +{ + /* + * Returns a new NumPy array containing the unique values of the input array. + * This function uses hashing to identify uniqueness efficiently. 
+ */ + NPY_ALLOW_C_API_DEF; + NPY_ALLOW_C_API; + PyArray_Descr *descr = PyArray_DESCR(self); + Py_INCREF(descr); + NPY_DISABLE_C_API; + + PyThreadState *_save1 = PyEval_SaveThread(); + + // number of elements in the input array + npy_intp isize = PyArray_SIZE(self); + + // variables for the vstring + npy_string_allocator *in_allocator = NpyString_acquire_allocator((PyArray_StringDTypeObject *)descr); + auto hash = [equal_nan](const npy_static_string *value) -> size_t { + if (value->buf == NULL) { + if (equal_nan) { + return 0; + } else { + return std::hash{}(value); + } + } + return npy_fnv1a(value->buf, value->size * sizeof(char)); + }; + auto equal = [equal_nan](const npy_static_string *lhs, const npy_static_string *rhs) -> bool { + if (lhs->buf == NULL && rhs->buf == NULL) { + if (equal_nan) { + return true; + } else { + return lhs == rhs; + } + } + if (lhs->buf == NULL || rhs->buf == NULL) { + return false; + } + if (lhs->size != rhs->size) { + return false; + } + return std::memcmp(lhs->buf, rhs->buf, lhs->size) == 0; + }; + // Reserve hashset capacity in advance to minimize reallocations and collisions. + // We use min(isize, HASH_TABLE_INITIAL_BUCKETS) as the initial bucket count: + // - Reserving for all elements (isize) may over-allocate when there are few unique values. + // - Using a moderate upper bound HASH_TABLE_INITIAL_BUCKETS(1024) keeps memory usage reasonable (4 KiB for pointers). + // See discussion: https://github.com/numpy/numpy/pull/28767#discussion_r2064267631 + std::unordered_set hashset( + std::min(isize, (npy_intp)HASH_TABLE_INITIAL_BUCKETS), hash, equal + ); + + // Input array is one-dimensional, enabling efficient iteration using strides. + char *idata = PyArray_BYTES(self); + npy_intp istride = PyArray_STRIDES(self)[0]; + // unpacked_strings need to be allocated outside of the loop because of the lifetime problem. 
+ std::vector unpacked_strings(isize, {0, NULL}); + for (npy_intp i = 0; i < isize; i++, idata += istride) { + npy_packed_static_string *packed_string = (npy_packed_static_string *)idata; + int is_null = NpyString_load(in_allocator, packed_string, &unpacked_strings[i]); + if (is_null == -1) { + npy_gil_error(PyExc_RuntimeError, + "Failed to load string from packed static string. "); + return NULL; + } + hashset.insert(&unpacked_strings[i]); + } + + NpyString_release_allocator(in_allocator); + + npy_intp length = hashset.size(); + + PyEval_RestoreThread(_save1); + NPY_ALLOW_C_API; + PyObject *res_obj = PyArray_NewFromDescr( + &PyArray_Type, + descr, + 1, // ndim + &length, // shape + NULL, // strides + NULL, // data + // This flag is needed to be able to call .sort on it. + NPY_ARRAY_WRITEABLE, // flags + NULL // obj + ); if (res_obj == NULL) { return NULL; } + PyArray_Descr *res_descr = PyArray_DESCR((PyArrayObject *)res_obj); + Py_INCREF(res_descr); + NPY_DISABLE_C_API; + + PyThreadState *_save2 = PyEval_SaveThread(); + auto save2_dealloc = finally([&]() { + PyEval_RestoreThread(_save2); + }); + + npy_string_allocator *out_allocator = NpyString_acquire_allocator((PyArray_StringDTypeObject *)res_descr); + auto out_allocator_dealloc = finally([&]() { + NpyString_release_allocator(out_allocator); + }); - // then we iterate through the map's keys to get the unique values - T* data = (T *)PyArray_DATA((PyArrayObject *)res_obj); - auto it = hashset.begin(); - size_t i = 0; - for (; it != hashset.end(); it++, i++) { - data[i] = *it; + char *odata = PyArray_BYTES((PyArrayObject *)res_obj); + npy_intp ostride = PyArray_STRIDES((PyArrayObject *)res_obj)[0]; + // Output array is one-dimensional, enabling efficient iteration using strides. 
+ for (auto it = hashset.begin(); it != hashset.end(); it++, odata += ostride) { + npy_packed_static_string *packed_string = (npy_packed_static_string *)odata; + int pack_status = 0; + if ((*it)->buf == NULL) { + pack_status = NpyString_pack_null(out_allocator, packed_string); + } else { + pack_status = NpyString_pack(out_allocator, packed_string, (*it)->buf, (*it)->size); + } + if (pack_status == -1) { + // string packing failed + return NULL; + } } return res_obj; @@ -119,27 +307,30 @@ unique(PyArrayObject *self) // this map contains the functions used for each item size. -typedef std::function function_type; +typedef std::function function_type; std::unordered_map unique_funcs = { - {NPY_BYTE, unique}, - {NPY_UBYTE, unique}, - {NPY_SHORT, unique}, - {NPY_USHORT, unique}, - {NPY_INT, unique}, - {NPY_UINT, unique}, - {NPY_LONG, unique}, - {NPY_ULONG, unique}, - {NPY_LONGLONG, unique}, - {NPY_ULONGLONG, unique}, - {NPY_INT8, unique}, - {NPY_INT16, unique}, - {NPY_INT32, unique}, - {NPY_INT64, unique}, - {NPY_UINT8, unique}, - {NPY_UINT16, unique}, - {NPY_UINT32, unique}, - {NPY_UINT64, unique}, - {NPY_DATETIME, unique}, + {NPY_BYTE, unique_integer}, + {NPY_UBYTE, unique_integer}, + {NPY_SHORT, unique_integer}, + {NPY_USHORT, unique_integer}, + {NPY_INT, unique_integer}, + {NPY_UINT, unique_integer}, + {NPY_LONG, unique_integer}, + {NPY_ULONG, unique_integer}, + {NPY_LONGLONG, unique_integer}, + {NPY_ULONGLONG, unique_integer}, + {NPY_INT8, unique_integer}, + {NPY_INT16, unique_integer}, + {NPY_INT32, unique_integer}, + {NPY_INT64, unique_integer}, + {NPY_UINT8, unique_integer}, + {NPY_UINT16, unique_integer}, + {NPY_UINT32, unique_integer}, + {NPY_UINT64, unique_integer}, + {NPY_DATETIME, unique_integer}, + {NPY_STRING, unique_string}, + {NPY_UNICODE, unique_string}, + {NPY_VSTRING, unique_vstring}, }; @@ -154,14 +345,21 @@ std::unordered_map unique_funcs = { * type is unsupported or `NULL` with an error set. 
*/ extern "C" NPY_NO_EXPORT PyObject * -array__unique_hash(PyObject *NPY_UNUSED(module), PyObject *arr_obj) +array__unique_hash(PyObject *NPY_UNUSED(module), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { - if (!PyArray_Check(arr_obj)) { - PyErr_SetString(PyExc_TypeError, - "_unique_hash() requires a NumPy array input."); + PyArrayObject *arr = NULL; + npy_bool equal_nan = NPY_TRUE; // default to True + + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("_unique_hash", args, len_args, kwnames, + "arr", &PyArray_Converter, &arr, + "|equal_nan", &PyArray_BoolConverter, &equal_nan, + NULL, NULL, NULL + ) < 0 + ) { return NULL; } - PyArrayObject *arr = (PyArrayObject *)arr_obj; try { auto type = PyArray_TYPE(arr); @@ -170,7 +368,7 @@ array__unique_hash(PyObject *NPY_UNUSED(module), PyObject *arr_obj) Py_RETURN_NOTIMPLEMENTED; } - return unique_funcs[type](arr); + return unique_funcs[type](arr, equal_nan); } catch (const std::bad_alloc &e) { PyErr_NoMemory(); diff --git a/numpy/_core/src/multiarray/unique.h b/numpy/_core/src/multiarray/unique.h index 3e258405e8f4..7b3fb143ada4 100644 --- a/numpy/_core/src/multiarray/unique.h +++ b/numpy/_core/src/multiarray/unique.h @@ -5,7 +5,8 @@ extern "C" { #endif -PyObject* array__unique_hash(PyObject *NPY_UNUSED(dummy), PyObject *args); +PyObject* array__unique_hash(PyObject *NPY_UNUSED(dummy), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames); #ifdef __cplusplus } diff --git a/numpy/lib/_arraysetops_impl.py b/numpy/lib/_arraysetops_impl.py index ef0739ba486f..c4788385b924 100644 --- a/numpy/lib/_arraysetops_impl.py +++ b/numpy/lib/_arraysetops_impl.py @@ -368,7 +368,8 @@ def _unique1d(ar, return_index=False, return_inverse=False, conv = _array_converter(ar) ar_, = conv - if (hash_unique := _unique_hash(ar_)) is not NotImplemented: + if (hash_unique := _unique_hash(ar_, equal_nan=equal_nan)) \ + is not NotImplemented: if sorted: hash_unique.sort() # We wrap the result back in case it was a 
subclass of numpy.ndarray. diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py index 7865e1b16ee9..b3e2bfa279b0 100644 --- a/numpy/lib/tests/test_arraysetops.py +++ b/numpy/lib/tests/test_arraysetops.py @@ -5,6 +5,7 @@ import numpy as np from numpy import ediff1d, intersect1d, isin, setdiff1d, setxor1d, union1d, unique +from numpy.dtypes import StringDType from numpy.exceptions import AxisError from numpy.testing import ( assert_array_equal, @@ -813,7 +814,9 @@ def test_unique_1d(self): def test_unique_zero_sized(self): # test for zero-sized arrays - for dt in self.get_types(): + types = self.get_types() + types.extend('SU') + for dt in types: a = np.array([], dt) b = np.array([], dt) i1 = np.array([], np.int64) @@ -838,6 +841,187 @@ class Subclass(np.ndarray): bb = Subclass(b.shape, dtype=dt, buffer=b) self.check_all(aa, bb, i1, i2, c, dt) + def test_unique_byte_string_hash_based(self): + # test for byte string arrays + arr = ['apple', 'banana', 'apple', 'cherry', 'date', 'banana', 'fig', 'grape'] + unq_sorted = ['apple', 'banana', 'cherry', 'date', 'fig', 'grape'] + + a1 = unique(arr, sorted=False) + # the result varies depending on the impl of std::unordered_set, + # so we check them by sorting + assert_array_equal(sorted(a1.tolist()), unq_sorted) + + def test_unique_unicode_string_hash_based(self): + # test for unicode string arrays + arr = [ + 'café', 'cafe', 'café', 'naïve', 'naive', + 'résumé', 'naïve', 'resume', 'résumé', + ] + unq_sorted = ['cafe', 'café', 'naive', 'naïve', 'resume', 'résumé'] + + a1 = unique(arr, sorted=False) + # the result varies depending on the impl of std::unordered_set, + # so we check them by sorting + assert_array_equal(sorted(a1.tolist()), unq_sorted) + + def test_unique_vstring_hash_based_equal_nan(self): + # test for unicode and nullable string arrays (equal_nan=True) + a = np.array([ + # short strings + 'straße', + None, + 'strasse', + 'straße', + None, + 'niño', + 'nino', + 'élève', + 
'eleve', + 'niño', + 'élève', + # medium strings + 'b' * 20, + 'ß' * 30, + None, + 'é' * 30, + 'e' * 20, + 'ß' * 30, + 'n' * 30, + 'ñ' * 20, + None, + 'e' * 20, + 'ñ' * 20, + # long strings + 'b' * 300, + 'ß' * 400, + None, + 'é' * 400, + 'e' * 300, + 'ß' * 400, + 'n' * 400, + 'ñ' * 300, + None, + 'e' * 300, + 'ñ' * 300, + ], + dtype=StringDType(na_object=None) + ) + unq_sorted_wo_none = [ + 'b' * 20, + 'b' * 300, + 'e' * 20, + 'e' * 300, + 'eleve', + 'nino', + 'niño', + 'n' * 30, + 'n' * 400, + 'strasse', + 'straße', + 'ß' * 30, + 'ß' * 400, + 'élève', + 'é' * 30, + 'é' * 400, + 'ñ' * 20, + 'ñ' * 300, + ] + + a1 = unique(a, sorted=False, equal_nan=True) + # the result varies depending on the impl of std::unordered_set, + # so we check them by sorting + + # a1 should have exactly one None + count_none = sum(x is None for x in a1) + assert_equal(count_none, 1) + + a1_wo_none = sorted(x for x in a1 if x is not None) + assert_array_equal(a1_wo_none, unq_sorted_wo_none) + + def test_unique_vstring_hash_based_not_equal_nan(self): + # test for unicode and nullable string arrays (equal_nan=False) + a = np.array([ + # short strings + 'straße', + None, + 'strasse', + 'straße', + None, + 'niño', + 'nino', + 'élève', + 'eleve', + 'niño', + 'élève', + # medium strings + 'b' * 20, + 'ß' * 30, + None, + 'é' * 30, + 'e' * 20, + 'ß' * 30, + 'n' * 30, + 'ñ' * 20, + None, + 'e' * 20, + 'ñ' * 20, + # long strings + 'b' * 300, + 'ß' * 400, + None, + 'é' * 400, + 'e' * 300, + 'ß' * 400, + 'n' * 400, + 'ñ' * 300, + None, + 'e' * 300, + 'ñ' * 300, + ], + dtype=StringDType(na_object=None) + ) + unq_sorted_wo_none = [ + 'b' * 20, + 'b' * 300, + 'e' * 20, + 'e' * 300, + 'eleve', + 'nino', + 'niño', + 'n' * 30, + 'n' * 400, + 'strasse', + 'straße', + 'ß' * 30, + 'ß' * 400, + 'élève', + 'é' * 30, + 'é' * 400, + 'ñ' * 20, + 'ñ' * 300, + ] + + a1 = unique(a, sorted=False, equal_nan=False) + # the result varies depending on the impl of std::unordered_set, + # so we check them by sorting + + # a1 
should have exactly one None + count_none = sum(x is None for x in a1) + assert_equal(count_none, 6) + + a1_wo_none = sorted(x for x in a1 if x is not None) + assert_array_equal(a1_wo_none, unq_sorted_wo_none) + + def test_unique_vstring_errors(self): + a = np.array( + [ + 'apple', 'banana', 'apple', None, 'cherry', + 'date', 'banana', 'fig', None, 'grape', + ] * 2, + dtype=StringDType(na_object=None) + ) + assert_raises(ValueError, unique, a, equal_nan=False) + @pytest.mark.parametrize("arg", ["return_index", "return_inverse", "return_counts"]) def test_unsupported_hash_based(self, arg): """These currently never use the hash-based solution. However,