From 1d63c0e57b90ff9a24104a70bfded96b1d8d5c93 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 18 Oct 2022 17:19:40 -0400 Subject: [PATCH 1/8] Extended outlier encoding to dbscan_clustering method --- sklearn/cluster/_hdbscan/hdbscan.py | 8 +++++++- sklearn/cluster/tests/test_hdbscan.py | 27 ++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 79beead943898..04844de4708fd 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -767,9 +767,15 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): An array of cluster labels, one per datapoint. Unclustered points are assigned the label -1. """ - return labelling_at_cut( + labels = labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size ) + infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"] + missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"] + + labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"] + labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] + return labels def _more_tags(self): return {"allow_nan": self.metric != "precomputed"} diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 65f9829ffef5c..48ce60dc05a0d 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -143,12 +143,37 @@ def test_hdbscan_algorithms(algo, metric): hdb.fit(X) -def test_hdbscan_dbscan_clustering(): +def test_dbscan_clustering(): clusterer = HDBSCAN().fit(X) labels = clusterer.dbscan_clustering(0.3) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true +@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) +@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1)) +def test_dbscan_clustering_outlier_data(outlier_type, cut_distance): + """ + Tests if np.inf and np.nan data are each treated as special outliers. + """ + outlier = { + "infinite": np.inf, + "missing": np.nan, + }[outlier_type] + label = _OUTLIER_ENCODING[outlier_type]["label"] + + X_outlier = X.copy() + X_outlier[0] = [outlier, 1] + X_outlier[5] = [outlier, outlier] + model = HDBSCAN().fit(X_outlier) + labels = model.dbscan_clustering(cut_distance=cut_distance) + + (missing_labels_idx,) = (labels == label).nonzero() + assert_array_equal(missing_labels_idx, [0, 5]) + + clean_indices = list(range(1, 5)) + list(range(6, 200)) + clean_model = HDBSCAN().fit(X_outlier[clean_indices]) + clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) + assert_array_equal(clean_labels, labels[clean_indices]) def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) From edfa11d51c6024ddb6132b9d135522477c87204d Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 18 Oct 2022 17:49:54 -0400 Subject: [PATCH 2/8] Improved documentation --- sklearn/cluster/_hdbscan/hdbscan.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 04844de4708fd..a7487db970412 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -27,7 +27,8 @@ FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics -# Encodings are arbitray, but chosen as extensions to the -1 noise label. +# Encodings are arbitray but must be strictly negative. 
+# The current encodings are chosen as extensions to the -1 noise label. # Avoided enums so that the end user only deals with simple labels. _OUTLIER_ENCODING = { "infinite": { @@ -764,15 +765,20 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): Returns ------- labels : ndarray of shape (n_samples,) - An array of cluster labels, one per datapoint. Unclustered points - are assigned the label -1. + An array of cluster labels, one per datapoint. + Outliers are labeled as follows: + - Noisy samples are given the label -1. + - Samples with infinite elements (+/- np.inf) are given the label -2. + - Samples with missing data are given the label -3. """ labels = labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size ) + # Infer indices from labels generated during `fit` infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"] missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"] + # Overwrite infinite/missing outlier samples (otherwise simple noise) labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"] labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] return labels From 31e4ac290ccb48437b802fa56ba17f54dd5e6fe8 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 18 Oct 2022 18:56:09 -0400 Subject: [PATCH 3/8] Black --- sklearn/cluster/_hdbscan/hdbscan.py | 4 ++-- sklearn/cluster/tests/test_hdbscan.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index a7487db970412..4a186efa4d3a6 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -765,11 +765,11 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): Returns ------- labels : ndarray of shape (n_samples,) - An array of cluster labels, one per datapoint. + An array of cluster labels, one per datapoint. Outliers are labeled as follows: - Noisy samples are given the label -1. - Samples with infinite elements (+/- np.inf) are given the label -2. - - Samples with missing data are given the label -3. + - Samples with missing data are given the label -3. 
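+# (Cluster labels proper are non-negative integers, so a strictly negative
+# code can never collide with a real cluster.)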
""" labels = labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 48ce60dc05a0d..dc1fd5248201f 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -149,6 +149,7 @@ def test_dbscan_clustering(): n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true + @pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) @pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1)) def test_dbscan_clustering_outlier_data(outlier_type, cut_distance): @@ -175,6 +176,7 @@ def test_dbscan_clustering_outlier_data(outlier_type, cut_distance): clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) assert_array_equal(clean_labels, labels[clean_indices]) + def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) H = StandardScaler().fit_transform(H) From 10355aa8c1c054aba79bdf804e067053a0ac6c56 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 5 Nov 2022 14:51:37 -0400 Subject: [PATCH 4/8] Updated test and clarified condition with both missing/inf data --- sklearn/cluster/_hdbscan/hdbscan.py | 9 ++++++--- sklearn/cluster/tests/test_hdbscan.py | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 4a186efa4d3a6..579d5ea89824e 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -362,7 +362,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Outliers are labeled as follows: - Noisy samples are given the label -1. - Samples with infinite elements (+/- np.inf) are given the label -2. - - Samples with missing data are given the label -3. + - Samples with missing data are given the label -3, even if they + also have infinite elements. probabilities_ : ndarray of shape (n_samples,) The strength with which each sample is a member of its assigned @@ -664,7 +665,8 @@ def fit(self, X, y=None): self._single_linkage_tree_ = remap_single_linkage_tree( self._single_linkage_tree_, internal_to_raw, - non_finite=infinite_index + missing_index, + # There may be overlap for points w/ both `np.inf` and `np.nan` + non_finite=set(infinite_index + missing_index), ) new_labels = np.empty(self._raw_data.shape[0], dtype=np.int32) new_labels[finite_index] = self.labels_ @@ -769,7 +771,8 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): Outliers are labeled as follows: - Noisy samples are given the label -1. - Samples with infinite elements (+/- np.inf) are given the label -2. - - Samples with missing data are given the label -3. + - Samples with missing data are given the label -3, even if they + also have infinite elements. 
""" labels = labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index dc1fd5248201f..7949c3686c787 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -150,26 +150,26 @@ def test_dbscan_clustering(): assert n_clusters == n_clusters_true -@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) @pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1)) -def test_dbscan_clustering_outlier_data(outlier_type, cut_distance): +def test_dbscan_clustering_outlier_data(cut_distance): """ Tests if np.inf and np.nan data are each treated as special outliers. """ - outlier = { - "infinite": np.inf, - "missing": np.nan, - }[outlier_type] - label = _OUTLIER_ENCODING[outlier_type]["label"] + missing_label = _OUTLIER_ENCODING["missing"]["label"] + infinite_label = _OUTLIER_ENCODING["infinite"]["label"] X_outlier = X.copy() - X_outlier[0] = [outlier, 1] - X_outlier[5] = [outlier, outlier] + X_outlier[0] = [np.inf, 1] + X_outlier[2] = [1, np.nan] + X_outlier[5] = [np.inf, np.nan] model = HDBSCAN().fit(X_outlier) labels = model.dbscan_clustering(cut_distance=cut_distance) - (missing_labels_idx,) = (labels == label).nonzero() - assert_array_equal(missing_labels_idx, [0, 5]) + (missing_labels_idx,) = (labels == missing_label).nonzero() + assert_array_equal(missing_labels_idx, [2, 5]) + + (infinite_labels_idx,) = (labels == infinite_label).nonzero() + assert_array_equal(infinite_labels_idx, [0]) clean_indices = list(range(1, 5)) + list(range(6, 200)) clean_model = HDBSCAN().fit(X_outlier[clean_indices]) From 731612d0ad039f03821e8d80b8f58229acf0fc11 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Tue, 8 Nov 2022 12:12:20 -0500 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/cluster/tests/test_hdbscan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 7949c3686c787..dc37e8d5ef8f2 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -165,10 +165,10 @@ def test_dbscan_clustering_outlier_data(cut_distance): model = HDBSCAN().fit(X_outlier) labels = model.dbscan_clustering(cut_distance=cut_distance) - (missing_labels_idx,) = (labels == missing_label).nonzero() + missing_labels_idx = np.flatnonzero(labels == missing_label) assert_array_equal(missing_labels_idx, [2, 5]) - (infinite_labels_idx,) = (labels == infinite_label).nonzero() + infinite_labels_idx = np.flatnonzero(labels == infinite_label) assert_array_equal(infinite_labels_idx, [0]) clean_indices = list(range(1, 5)) + list(range(6, 200)) From ebb72b407c4d6f8f3be8aeb98e938be52459790a Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Tue, 7 Feb 2023 11:18:50 -0500 Subject: [PATCH 6/8] Update sklearn/cluster/_hdbscan/hdbscan.py Co-authored-by: Julien Jerphanion --- sklearn/cluster/_hdbscan/hdbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 38e803313fc3b..ae3645c88dccb 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -32,7 +32,7 @@ FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics -# Encodings are arbitray but must be strictly negative. 
+# Encodings are arbitrary but must be strictly negative. # The current encodings are chosen as extensions to the -1 noise label. # Avoided enums so that the end user only deals with simple labels. _OUTLIER_ENCODING = { From 3db72616cdf6d607ef4b3f11a8209c7dcf43b85c Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 8 Feb 2023 18:33:45 -0500 Subject: [PATCH 7/8] Corrected/improved test --- sklearn/cluster/tests/test_hdbscan.py | 6 +- .../_argkminclassmode.pyx | 385 ++++++++++++++++++ 2 files changed, 388 insertions(+), 3 deletions(-) create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index dc37e8d5ef8f2..8181c0d2cb218 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -171,10 +171,10 @@ def test_dbscan_clustering_outlier_data(cut_distance): infinite_labels_idx = np.flatnonzero(labels == infinite_label) assert_array_equal(infinite_labels_idx, [0]) - clean_indices = list(range(1, 5)) + list(range(6, 200)) - clean_model = HDBSCAN().fit(X_outlier[clean_indices]) + clean_idx = list(set(range(200)) - set(missing_labels_idx + infinite_labels_idx)) + clean_model = HDBSCAN().fit(X_outlier[clean_idx]) clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) - assert_array_equal(clean_labels, labels[clean_indices]) + assert_array_equal(clean_labels, labels[clean_idx]) def test_hdbscan_high_dimensional(): diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx new file mode 100644 index 0000000000000..102f7c99059b5 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx @@ -0,0 +1,385 @@ + +from cython cimport floating, integral +from cython.parallel cimport parallel, prange +from libcpp.map cimport map as cpp_map, pair as cpp_pair +from libc.stdlib cimport free + +cimport numpy as cnp + +cnp.import_array() + +from ...utils._typedefs cimport ITYPE_t, DTYPE_t +from ...utils._typedefs import ITYPE, DTYPE +from ...utils._sorting cimport simultaneous_sort +import numpy as np +from scipy.sparse import issparse +from sklearn.utils.fixes import threadpool_limits + +cpdef enum WeightingStrategy: + uniform = 0 + # TODO: Implement the following options, most likely in + # `weighted_histogram_mode` + distance = 1 + callable = 2 +from ._argkmin cimport ArgKmin64 +from ._datasets_pair cimport DatasetsPair64 + +cdef class ArgKminClassMode64(ArgKmin64): + """ + 64bit implementation of ArgKminClassMode. + """ + cdef: + const ITYPE_t[:] labels, + const ITYPE_t[:] unique_labels + DTYPE_t[:, :] label_weights + cpp_map[ITYPE_t, ITYPE_t] labels_to_index + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + ITYPE_t k, + weights, + labels, + unique_labels, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + """Compute the argkmin reduction with labels. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKminClassMode64`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance _must_ directly be created outside of this class method. 
+ """ + # Use a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKminClassMode64( + datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + labels=labels, + unique_labels=unique_labels, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair64 datasets_pair, + const ITYPE_t[:] labels, + const ITYPE_t[:] unique_labels, + chunk_size=None, + strategy=None, + ITYPE_t k=1, + weights=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.callable + self.labels = labels + + self.unique_labels = unique_labels + + cdef ITYPE_t idx, label + # Map from set of unique labels to their indices in `label_weights` + # Buffer used in building a histogram for one-pass weighted mode + self.label_weights = np.zeros( + (self.n_samples_X, unique_labels.shape[0]), dtype=DTYPE, + ) + + def _finalize_results(self): + probabilities = np.asarray(self.label_weights) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + cdef inline void weighted_histogram_mode( + self, + ITYPE_t sample_index, + ITYPE_t* indices, + DTYPE_t* distances, + ) nogil: + cdef: + ITYPE_t y_idx, label, label_index, multi_output_index + DTYPE_t label_weight = 1 + # TODO: Implement other WeightingStrategy values + bint use_distance_weighting = ( + self.weight_type == WeightingStrategy.distance + ) + + # Iterate through the sample k-nearest neighbours + for jdx in range(self.k): + # Absolute indice of the jdx-th Nearest Neighbors + # in range [0, n_samples_Y) + # TODO: inspect if it worth permuting this condition + # and the for-loop above for improved branching. + if use_distance_weighting: + label_weight = 1 / distances[jdx] + y_idx = indices[jdx] + label = self.labels[y_idx] + self.label_weights[sample_index][label] += label_weight + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, sample_index + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. 
+ for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][0], + &self.heaps_r_distances_chunks[thread_num][0], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). + for sample_index in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[sample_index, 0], + &self.argkmin_indices[sample_index, 0], + self.k, + ) + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return +from ._argkmin cimport ArgKmin32 +from ._datasets_pair cimport DatasetsPair32 + +cdef class ArgKminClassMode32(ArgKmin32): + """ + 32bit implementation of ArgKminClassMode. + """ + cdef: + const ITYPE_t[:] labels, + const ITYPE_t[:] unique_labels + DTYPE_t[:, :] label_weights + cpp_map[ITYPE_t, ITYPE_t] labels_to_index + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + ITYPE_t k, + weights, + labels, + unique_labels, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + """Compute the argkmin reduction with labels. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKminClassMode32`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance _must_ directly be created outside of this class method. + """ + # Use a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKminClassMode32( + datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + labels=labels, + unique_labels=unique_labels, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
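+            # Each sample's k neighbours occupy a contiguous slice of length
+            # k in the per-thread heap buffers, hence the `idx * self.k`
+            # offsets below.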
+ with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair32 datasets_pair, + const ITYPE_t[:] labels, + const ITYPE_t[:] unique_labels, + chunk_size=None, + strategy=None, + ITYPE_t k=1, + weights=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.callable + self.labels = labels + + self.unique_labels = unique_labels + + cdef ITYPE_t idx, label + # Map from set of unique labels to their indices in `label_weights` + # Buffer used in building a histogram for one-pass weighted mode + self.label_weights = np.zeros( + (self.n_samples_X, unique_labels.shape[0]), dtype=DTYPE, + ) + + def _finalize_results(self): + probabilities = np.asarray(self.label_weights) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + cdef inline void weighted_histogram_mode( + self, + ITYPE_t sample_index, + ITYPE_t* indices, + DTYPE_t* distances, + ) nogil: + cdef: + ITYPE_t y_idx, label, label_index, multi_output_index + DTYPE_t label_weight = 1 + # TODO: Implement other WeightingStrategy values + bint use_distance_weighting = ( + self.weight_type == WeightingStrategy.distance + ) + + # Iterate through the sample k-nearest neighbours + for jdx in range(self.k): + # Absolute indice of the jdx-th Nearest Neighbors + # in range [0, n_samples_Y) + # TODO: inspect if it worth permuting this condition + # and the for-loop above for improved branching. + if use_distance_weighting: + label_weight = 1 / distances[jdx] + y_idx = indices[jdx] + label = self.labels[y_idx] + self.label_weights[sample_index][label] += label_weight + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, sample_index + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][0], + &self.heaps_r_distances_chunks[thread_num][0], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). 
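+            # `threadpool_limits` comes from threadpoolctl (re-exported via
+            # `sklearn.utils.fixes`); the previous BLAS thread limit is
+            # restored when the block exits.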
+ for sample_index in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[sample_index, 0], + &self.argkmin_indices[sample_index, 0], + self.k, + ) + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return From eb263f0c8092ea1e07a45c645adc5050bb358ded Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 8 Feb 2023 18:34:52 -0500 Subject: [PATCH 8/8] Removed accidental inclusion --- .../_argkminclassmode.pyx | 385 ------------------ 1 file changed, 385 deletions(-) delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx deleted file mode 100644 index 102f7c99059b5..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx +++ /dev/null @@ -1,385 +0,0 @@ - -from cython cimport floating, integral -from cython.parallel cimport parallel, prange -from libcpp.map cimport map as cpp_map, pair as cpp_pair -from libc.stdlib cimport free - -cimport numpy as cnp - -cnp.import_array() - -from ...utils._typedefs cimport ITYPE_t, DTYPE_t -from ...utils._typedefs import ITYPE, DTYPE -from ...utils._sorting cimport simultaneous_sort -import numpy as np -from scipy.sparse import issparse -from sklearn.utils.fixes import threadpool_limits - -cpdef enum WeightingStrategy: - uniform = 0 - # TODO: Implement the following options, most likely in - # `weighted_histogram_mode` - distance = 1 - callable = 2 -from ._argkmin cimport ArgKmin64 -from ._datasets_pair cimport DatasetsPair64 - -cdef class ArgKminClassMode64(ArgKmin64): - """ - 64bit implementation of ArgKminClassMode. - """ - cdef: - const ITYPE_t[:] labels, - const ITYPE_t[:] unique_labels - DTYPE_t[:, :] label_weights - cpp_map[ITYPE_t, ITYPE_t] labels_to_index - WeightingStrategy weight_type - - @classmethod - def compute( - cls, - X, - Y, - ITYPE_t k, - weights, - labels, - unique_labels, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - ): - """Compute the argkmin reduction with labels. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`ArgKminClassMode64`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance _must_ directly be created outside of this class method. - """ - # Use a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. - pda = ArgKminClassMode64( - datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs), - k=k, - chunk_size=chunk_size, - strategy=strategy, - weights=weights, - labels=labels, - unique_labels=unique_labels, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). 
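+                # `simultaneous_sort` sorts the distances in ascending order
+                # and applies the same permutation to the indices, keeping
+                # the two arrays aligned.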
- with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results() - - def __init__( - self, - DatasetsPair64 datasets_pair, - const ITYPE_t[:] labels, - const ITYPE_t[:] unique_labels, - chunk_size=None, - strategy=None, - ITYPE_t k=1, - weights=None, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - k=k, - ) - - if weights == "uniform": - self.weight_type = WeightingStrategy.uniform - elif weights == "distance": - self.weight_type = WeightingStrategy.distance - else: - self.weight_type = WeightingStrategy.callable - self.labels = labels - - self.unique_labels = unique_labels - - cdef ITYPE_t idx, label - # Map from set of unique labels to their indices in `label_weights` - # Buffer used in building a histogram for one-pass weighted mode - self.label_weights = np.zeros( - (self.n_samples_X, unique_labels.shape[0]), dtype=DTYPE, - ) - - def _finalize_results(self): - probabilities = np.asarray(self.label_weights) - probabilities /= probabilities.sum(axis=1, keepdims=True) - return probabilities - - cdef inline void weighted_histogram_mode( - self, - ITYPE_t sample_index, - ITYPE_t* indices, - DTYPE_t* distances, - ) nogil: - cdef: - ITYPE_t y_idx, label, label_index, multi_output_index - DTYPE_t label_weight = 1 - # TODO: Implement other WeightingStrategy values - bint use_distance_weighting = ( - self.weight_type == WeightingStrategy.distance - ) - - # Iterate through the sample k-nearest neighbours - for jdx in range(self.k): - # Absolute indice of the jdx-th Nearest Neighbors - # in range [0, n_samples_Y) - # TODO: inspect if it worth permuting this condition - # and the for-loop above for improved branching. - if use_distance_weighting: - label_weight = 1 / distances[jdx] - y_idx = indices[jdx] - label = self.labels[y_idx] - self.label_weights[sample_index][label] += label_weight - return - - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, sample_index - # Sorting the main heaps portion associated to `X[X_start:X_end]` - # in ascending order w.r.t the distances. - for idx in range(X_end - X_start): - simultaneous_sort( - self.heaps_r_distances_chunks[thread_num] + idx * self.k, - self.heaps_indices_chunks[thread_num] + idx * self.k, - self.k - ) - # One-pass top-one weighted mode - # Compute the absolute index in [0, n_samples_X) - sample_index = X_start + idx - self.weighted_histogram_mode( - sample_index, - &self.heaps_indices_chunks[thread_num][0], - &self.heaps_r_distances_chunks[thread_num][0], - ) - return - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t sample_index, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). 
- for sample_index in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[sample_index, 0], - &self.argkmin_indices[sample_index, 0], - self.k, - ) - self.weighted_histogram_mode( - sample_index, - &self.argkmin_indices[sample_index][0], - &self.argkmin_distances[sample_index][0], - ) - return -from ._argkmin cimport ArgKmin32 -from ._datasets_pair cimport DatasetsPair32 - -cdef class ArgKminClassMode32(ArgKmin32): - """ - 32bit implementation of ArgKminClassMode. - """ - cdef: - const ITYPE_t[:] labels, - const ITYPE_t[:] unique_labels - DTYPE_t[:, :] label_weights - cpp_map[ITYPE_t, ITYPE_t] labels_to_index - WeightingStrategy weight_type - - @classmethod - def compute( - cls, - X, - Y, - ITYPE_t k, - weights, - labels, - unique_labels, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - ): - """Compute the argkmin reduction with labels. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`ArgKminClassMode32`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance _must_ directly be created outside of this class method. - """ - # Use a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. - pda = ArgKminClassMode32( - datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs), - k=k, - chunk_size=chunk_size, - strategy=strategy, - weights=weights, - labels=labels, - unique_labels=unique_labels, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). 
- with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results() - - def __init__( - self, - DatasetsPair32 datasets_pair, - const ITYPE_t[:] labels, - const ITYPE_t[:] unique_labels, - chunk_size=None, - strategy=None, - ITYPE_t k=1, - weights=None, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - k=k, - ) - - if weights == "uniform": - self.weight_type = WeightingStrategy.uniform - elif weights == "distance": - self.weight_type = WeightingStrategy.distance - else: - self.weight_type = WeightingStrategy.callable - self.labels = labels - - self.unique_labels = unique_labels - - cdef ITYPE_t idx, label - # Map from set of unique labels to their indices in `label_weights` - # Buffer used in building a histogram for one-pass weighted mode - self.label_weights = np.zeros( - (self.n_samples_X, unique_labels.shape[0]), dtype=DTYPE, - ) - - def _finalize_results(self): - probabilities = np.asarray(self.label_weights) - probabilities /= probabilities.sum(axis=1, keepdims=True) - return probabilities - - cdef inline void weighted_histogram_mode( - self, - ITYPE_t sample_index, - ITYPE_t* indices, - DTYPE_t* distances, - ) nogil: - cdef: - ITYPE_t y_idx, label, label_index, multi_output_index - DTYPE_t label_weight = 1 - # TODO: Implement other WeightingStrategy values - bint use_distance_weighting = ( - self.weight_type == WeightingStrategy.distance - ) - - # Iterate through the sample k-nearest neighbours - for jdx in range(self.k): - # Absolute indice of the jdx-th Nearest Neighbors - # in range [0, n_samples_Y) - # TODO: inspect if it worth permuting this condition - # and the for-loop above for improved branching. - if use_distance_weighting: - label_weight = 1 / distances[jdx] - y_idx = indices[jdx] - label = self.labels[y_idx] - self.label_weights[sample_index][label] += label_weight - return - - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, sample_index - # Sorting the main heaps portion associated to `X[X_start:X_end]` - # in ascending order w.r.t the distances. - for idx in range(X_end - X_start): - simultaneous_sort( - self.heaps_r_distances_chunks[thread_num] + idx * self.k, - self.heaps_indices_chunks[thread_num] + idx * self.k, - self.k - ) - # One-pass top-one weighted mode - # Compute the absolute index in [0, n_samples_X) - sample_index = X_start + idx - self.weighted_histogram_mode( - sample_index, - &self.heaps_indices_chunks[thread_num][0], - &self.heaps_r_distances_chunks[thread_num][0], - ) - return - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t sample_index, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). - for sample_index in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[sample_index, 0], - &self.argkmin_indices[sample_index, 0], - self.k, - ) - self.weighted_histogram_mode( - sample_index, - &self.argkmin_indices[sample_index][0], - &self.argkmin_distances[sample_index][0], - ) - return