|
1 | 1 | """Benchmarks for `numpy.lib`."""
|
2 | 2 |
|
| 3 | +import string |
3 | 4 |
|
4 | 5 | import numpy as np
|
5 | 6 |
|
@@ -119,37 +120,71 @@ def time_nanpercentile(self, array_size, percent_nans):
|
119 | 120 | class Unique(Benchmark):
|
120 | 121 | """Benchmark for np.unique with np.nan values."""
|
121 | 122 |
|
122 |
| - param_names = ["array_size", "percent_nans"] |
| 123 | + param_names = ["array_size", "percent_nans", "percent_unique_values", "dtype"] |
123 | 124 | params = [
|
124 | 125 | # sizes of the 1D arrays
|
125 | 126 | [200, int(2e5)],
|
126 | 127 | # percent of np.nan in arrays
|
127 |
| - [0, 0.1, 2., 50., 90.], |
| 128 | + [10., 90.], |
| 129 | + # percent of unique values in arrays |
| 130 | + [0.2, 20.], |
| 131 | + # dtypes of the arrays |
| 132 | + [np.float64, np.complex128, np.dtypes.StringDType(na_object=np.nan)], |
128 | 133 | ]
|
129 | 134 |
|
130 |
| - def setup(self, array_size, percent_nans): |
131 |
| - np.random.seed(123) |
| 135 | + def setup(self, array_size, percent_nans, percent_unique_values, dtype): |
| 136 | + rng = np.random.default_rng(123) |
132 | 137 | # produce a randomly shuffled array with the
|
133 | 138 | # approximate desired percentage np.nan content
|
134 |
| - base_array = np.random.uniform(size=array_size) |
135 |
| - n_nan = int(percent_nans * array_size) |
136 |
| - nan_indices = np.random.choice(np.arange(array_size), size=n_nan) |
| 139 | + unique_values_size = max(int(percent_unique_values / 100. * array_size), 2) |
| 140 | + match dtype: |
| 141 | + case np.float64: |
| 142 | + unique_array = rng.uniform(size=unique_values_size).astype(dtype) |
| 143 | + case np.complex128: |
| 144 | + unique_array = np.array( |
| 145 | + [ |
| 146 | + complex(*rng.uniform(size=2)) |
| 147 | + for _ in range(unique_values_size) |
| 148 | + ], |
| 149 | + dtype=dtype, |
| 150 | + ) |
| 151 | + case np.dtypes.StringDType(): |
| 152 | + chars = string.ascii_letters + string.digits |
| 153 | + unique_array = np.array( |
| 154 | + [ |
| 155 | + ''.join(rng.choice(list(chars), size=rng.integers(4, 8))) |
| 156 | + for _ in range(unique_values_size) |
| 157 | + ], |
| 158 | + dtype=dtype, |
| 159 | + ) |
| 160 | + case _: |
| 161 | + raise ValueError(f"Unsupported dtype {dtype}") |
| 162 | + |
| 163 | + base_array = np.resize(unique_array, array_size) |
| 164 | + rng.shuffle(base_array) |
| 165 | + # insert nans in random places |
| 166 | + n_nan = int(percent_nans / 100. * array_size) |
| 167 | + nan_indices = rng.choice(np.arange(array_size), size=n_nan, replace=False) |
137 | 168 | base_array[nan_indices] = np.nan
|
138 | 169 | self.arr = base_array
|
139 | 170 |
|
140 |
| - def time_unique_values(self, array_size, percent_nans): |
| 171 | + def time_unique_values(self, array_size, percent_nans, |
| 172 | + percent_unique_values, dtype): |
141 | 173 | np.unique(self.arr, return_index=False,
|
142 | 174 | return_inverse=False, return_counts=False)
|
143 | 175 |
|
144 |
| - def time_unique_counts(self, array_size, percent_nans): |
| 176 | + def time_unique_counts(self, array_size, percent_nans, |
| 177 | + percent_unique_values, dtype): |
145 | 178 | np.unique(self.arr, return_index=False,
|
146 |
| - return_inverse=False, return_counts=True) |
| 179 | + return_inverse=False, return_counts=True,) |
147 | 180 |
|
148 |
| - def time_unique_inverse(self, array_size, percent_nans): |
| 181 | + def time_unique_inverse(self, array_size, percent_nans, |
| 182 | + percent_unique_values, dtype): |
149 | 183 | np.unique(self.arr, return_index=False,
|
150 | 184 | return_inverse=True, return_counts=False)
|
151 | 185 |
|
152 |
| - def time_unique_all(self, array_size, percent_nans): |
| 186 | + def time_unique_all(self, array_size, percent_nans, |
| 187 | + percent_unique_values, dtype): |
153 | 188 | np.unique(self.arr, return_index=True,
|
154 | 189 | return_inverse=True, return_counts=True)
|
155 | 190 |
|
|
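For context, a minimal sketch of the kind of array the new StringDType case feeds into the benchmarked np.unique calls. It assumes NumPy 2.x, where np.dtypes.StringDType(na_object=np.nan) lets np.nan be stored as a missing-value sentinel; the tiny array and literal strings below are illustrative only, not part of the benchmark itself.

import numpy as np

# Illustrative only: a small StringDType array with np.nan as the
# missing-value sentinel, mirroring what setup() builds at a larger scale.
dt = np.dtypes.StringDType(na_object=np.nan)
arr = np.array(["pear", "apple", np.nan, "apple", np.nan, "pear"], dtype=dt)

# The call that time_unique_counts() measures (return_index and
# return_inverse are left at their default of False).
values, counts = np.unique(arr, return_counts=True)
print(values, counts)

With a standard airspeed velocity (asv) setup, the parametrized cases added here can be selected by regex, e.g. asv run --bench Unique.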