From 1d63c0e57b90ff9a24104a70bfded96b1d8d5c93 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 18 Oct 2022 17:19:40 -0400 Subject: [PATCH 1/8] Extended outlier encoding to dbscan_clustering method --- sklearn/cluster/_hdbscan/hdbscan.py | 8 +++++++- sklearn/cluster/tests/test_hdbscan.py | 27 ++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 79beead943898..04844de4708fd 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -767,9 +767,15 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): An array of cluster labels, one per datapoint. Unclustered points are assigned the label -1. """ - return labelling_at_cut( + labels = labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size ) + infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"] + missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"] + + labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"] + labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] + return labels def _more_tags(self): return {"allow_nan": self.metric != "precomputed"} diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 65f9829ffef5c..48ce60dc05a0d 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -143,12 +143,37 @@ def test_hdbscan_algorithms(algo, metric): hdb.fit(X) -def test_hdbscan_dbscan_clustering(): +def test_dbscan_clustering(): clusterer = HDBSCAN().fit(X) labels = clusterer.dbscan_clustering(0.3) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true +@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) +@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1)) +def test_dbscan_clustering_outlier_data(outlier_type, cut_distance): + """ + Tests if np.inf and np.nan data are each treated as special outliers. + """ + outlier = { + "infinite": np.inf, + "missing": np.nan, + }[outlier_type] + label = _OUTLIER_ENCODING[outlier_type]["label"] + + X_outlier = X.copy() + X_outlier[0] = [outlier, 1] + X_outlier[5] = [outlier, outlier] + model = HDBSCAN().fit(X_outlier) + labels = model.dbscan_clustering(cut_distance=cut_distance) + + (missing_labels_idx,) = (labels == label).nonzero() + assert_array_equal(missing_labels_idx, [0, 5]) + + clean_indices = list(range(1, 5)) + list(range(6, 200)) + clean_model = HDBSCAN().fit(X_outlier[clean_indices]) + clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) + assert_array_equal(clean_labels, labels[clean_indices]) def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) From edfa11d51c6024ddb6132b9d135522477c87204d Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 18 Oct 2022 17:49:54 -0400 Subject: [PATCH 2/8] Improved documentation --- sklearn/cluster/_hdbscan/hdbscan.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 04844de4708fd..a7487db970412 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -27,7 +27,8 @@ FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics -# Encodings are arbitray, but chosen as extensions to the -1 noise label. +# Encodings are arbitray but must be strictly negative. 
+# The current encodings are chosen as extensions to the -1 noise label. # Avoided enums so that the end user only deals with simple labels. _OUTLIER_ENCODING = { "infinite": { @@ -764,15 +765,20 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): Returns ------- labels : ndarray of shape (n_samples,) - An array of cluster labels, one per datapoint. Unclustered points - are assigned the label -1. + An array of cluster labels, one per datapoint. + Outliers are labeled as follows: + - Noisy samples are given the label -1. + - Samples with infinite elements (+/- np.inf) are given the label -2. + - Samples with missing data are given the label -3. """ labels = labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size ) + # Infer indices from labels generated during `fit` infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"] missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"] + # Overwrite infinite/missing outlier samples (otherwise simple noise) labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"] labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] return labels From 31e4ac290ccb48437b802fa56ba17f54dd5e6fe8 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 18 Oct 2022 18:56:09 -0400 Subject: [PATCH 3/8] Black --- sklearn/cluster/_hdbscan/hdbscan.py | 4 ++-- sklearn/cluster/tests/test_hdbscan.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index a7487db970412..4a186efa4d3a6 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -765,11 +765,11 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): Returns ------- labels : ndarray of shape (n_samples,) - An array of cluster labels, one per datapoint. + An array of cluster labels, one per datapoint. Outliers are labeled as follows: - Noisy samples are given the label -1. - Samples with infinite elements (+/- np.inf) are given the label -2. - - Samples with missing data are given the label -3. + - Samples with missing data are given the label -3. 
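+# (Cluster labels proper are non-negative integers, so a strictly negative
+# code can never collide with a real cluster.)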
""" labels = labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 48ce60dc05a0d..dc1fd5248201f 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -149,6 +149,7 @@ def test_dbscan_clustering(): n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true + @pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) @pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1)) def test_dbscan_clustering_outlier_data(outlier_type, cut_distance): @@ -175,6 +176,7 @@ def test_dbscan_clustering_outlier_data(outlier_type, cut_distance): clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) assert_array_equal(clean_labels, labels[clean_indices]) + def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) H = StandardScaler().fit_transform(H) From 10355aa8c1c054aba79bdf804e067053a0ac6c56 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 5 Nov 2022 14:51:37 -0400 Subject: [PATCH 4/8] Updated test and clarified condition with both missing/inf data --- sklearn/cluster/_hdbscan/hdbscan.py | 9 ++++++--- sklearn/cluster/tests/test_hdbscan.py | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 4a186efa4d3a6..579d5ea89824e 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -362,7 +362,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Outliers are labeled as follows: - Noisy samples are given the label -1. - Samples with infinite elements (+/- np.inf) are given the label -2. - - Samples with missing data are given the label -3. + - Samples with missing data are given the label -3, even if they + also have infinite elements. probabilities_ : ndarray of shape (n_samples,) The strength with which each sample is a member of its assigned @@ -664,7 +665,8 @@ def fit(self, X, y=None): self._single_linkage_tree_ = remap_single_linkage_tree( self._single_linkage_tree_, internal_to_raw, - non_finite=infinite_index + missing_index, + # There may be overlap for points w/ both `np.inf` and `np.nan` + non_finite=set(infinite_index + missing_index), ) new_labels = np.empty(self._raw_data.shape[0], dtype=np.int32) new_labels[finite_index] = self.labels_ @@ -769,7 +771,8 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): Outliers are labeled as follows: - Noisy samples are given the label -1. - Samples with infinite elements (+/- np.inf) are given the label -2. - - Samples with missing data are given the label -3. + - Samples with missing data are given the label -3, even if they + also have infinite elements. 
""" labels = labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index dc1fd5248201f..7949c3686c787 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -150,26 +150,26 @@ def test_dbscan_clustering(): assert n_clusters == n_clusters_true -@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) @pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1)) -def test_dbscan_clustering_outlier_data(outlier_type, cut_distance): +def test_dbscan_clustering_outlier_data(cut_distance): """ Tests if np.inf and np.nan data are each treated as special outliers. """ - outlier = { - "infinite": np.inf, - "missing": np.nan, - }[outlier_type] - label = _OUTLIER_ENCODING[outlier_type]["label"] + missing_label = _OUTLIER_ENCODING["missing"]["label"] + infinite_label = _OUTLIER_ENCODING["infinite"]["label"] X_outlier = X.copy() - X_outlier[0] = [outlier, 1] - X_outlier[5] = [outlier, outlier] + X_outlier[0] = [np.inf, 1] + X_outlier[2] = [1, np.nan] + X_outlier[5] = [np.inf, np.nan] model = HDBSCAN().fit(X_outlier) labels = model.dbscan_clustering(cut_distance=cut_distance) - (missing_labels_idx,) = (labels == label).nonzero() - assert_array_equal(missing_labels_idx, [0, 5]) + (missing_labels_idx,) = (labels == missing_label).nonzero() + assert_array_equal(missing_labels_idx, [2, 5]) + + (infinite_labels_idx,) = (labels == infinite_label).nonzero() + assert_array_equal(infinite_labels_idx, [0]) clean_indices = list(range(1, 5)) + list(range(6, 200)) clean_model = HDBSCAN().fit(X_outlier[clean_indices]) From 731612d0ad039f03821e8d80b8f58229acf0fc11 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Tue, 8 Nov 2022 12:12:20 -0500 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/cluster/tests/test_hdbscan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 7949c3686c787..dc37e8d5ef8f2 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -165,10 +165,10 @@ def test_dbscan_clustering_outlier_data(cut_distance): model = HDBSCAN().fit(X_outlier) labels = model.dbscan_clustering(cut_distance=cut_distance) - (missing_labels_idx,) = (labels == missing_label).nonzero() + missing_labels_idx = np.flatnonzero(labels == missing_label) assert_array_equal(missing_labels_idx, [2, 5]) - (infinite_labels_idx,) = (labels == infinite_label).nonzero() + infinite_labels_idx = np.flatnonzero(labels == infinite_label) assert_array_equal(infinite_labels_idx, [0]) clean_indices = list(range(1, 5)) + list(range(6, 200)) From ebb72b407c4d6f8f3be8aeb98e938be52459790a Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Tue, 7 Feb 2023 11:18:50 -0500 Subject: [PATCH 6/8] Update sklearn/cluster/_hdbscan/hdbscan.py Co-authored-by: Julien Jerphanion --- sklearn/cluster/_hdbscan/hdbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 38e803313fc3b..ae3645c88dccb 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -32,7 +32,7 @@ FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics -# Encodings are arbitray but must be strictly negative. 
+# Encodings are arbitrary but must be strictly negative. # The current encodings are chosen as extensions to the -1 noise label. # Avoided enums so that the end user only deals with simple labels. _OUTLIER_ENCODING = { From 3db72616cdf6d607ef4b3f11a8209c7dcf43b85c Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 8 Feb 2023 18:33:45 -0500 Subject: [PATCH 7/8] Corrected/improved test --- sklearn/cluster/tests/test_hdbscan.py | 6 +- .../_argkminclassmode.pyx | 385 ++++++++++++++++++ 2 files changed, 388 insertions(+), 3 deletions(-) create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index dc37e8d5ef8f2..8181c0d2cb218 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -171,10 +171,10 @@ def test_dbscan_clustering_outlier_data(cut_distance): infinite_labels_idx = np.flatnonzero(labels == infinite_label) assert_array_equal(infinite_labels_idx, [0]) - clean_indices = list(range(1, 5)) + list(range(6, 200)) - clean_model = HDBSCAN().fit(X_outlier[clean_indices]) + clean_idx = list(set(range(200)) - set(missing_labels_idx + infinite_labels_idx)) + clean_model = HDBSCAN().fit(X_outlier[clean_idx]) clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) - assert_array_equal(clean_labels, labels[clean_indices]) + assert_array_equal(clean_labels, labels[clean_idx]) def test_hdbscan_high_dimensional(): diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx new file mode 100644 index 0000000000000..102f7c99059b5 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx @@ -0,0 +1,385 @@ + +from cython cimport floating, integral +from cython.parallel cimport parallel, prange +from libcpp.map cimport map as cpp_map, pair as cpp_pair +from libc.stdlib cimport free + +cimport numpy as cnp + +cnp.import_array() + +from ...utils._typedefs cimport ITYPE_t, DTYPE_t +from ...utils._typedefs import ITYPE, DTYPE +from ...utils._sorting cimport simultaneous_sort +import numpy as np +from scipy.sparse import issparse +from sklearn.utils.fixes import threadpool_limits + +cpdef enum WeightingStrategy: + uniform = 0 + # TODO: Implement the following options, most likely in + # `weighted_histogram_mode` + distance = 1 + callable = 2 +from ._argkmin cimport ArgKmin64 +from ._datasets_pair cimport DatasetsPair64 + +cdef class ArgKminClassMode64(ArgKmin64): + """ + 64bit implementation of ArgKminClassMode. + """ + cdef: + const ITYPE_t[:] labels, + const ITYPE_t[:] unique_labels + DTYPE_t[:, :] label_weights + cpp_map[ITYPE_t, ITYPE_t] labels_to_index + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + ITYPE_t k, + weights, + labels, + unique_labels, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + """Compute the argkmin reduction with labels. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKminClassMode64`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance _must_ directly be created outside of this class method. 
+ """ + # Use a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKminClassMode64( + datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + labels=labels, + unique_labels=unique_labels, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair64 datasets_pair, + const ITYPE_t[:] labels, + const ITYPE_t[:] unique_labels, + chunk_size=None, + strategy=None, + ITYPE_t k=1, + weights=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.callable + self.labels = labels + + self.unique_labels = unique_labels + + cdef ITYPE_t idx, label + # Map from set of unique labels to their indices in `label_weights` + # Buffer used in building a histogram for one-pass weighted mode + self.label_weights = np.zeros( + (self.n_samples_X, unique_labels.shape[0]), dtype=DTYPE, + ) + + def _finalize_results(self): + probabilities = np.asarray(self.label_weights) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + cdef inline void weighted_histogram_mode( + self, + ITYPE_t sample_index, + ITYPE_t* indices, + DTYPE_t* distances, + ) nogil: + cdef: + ITYPE_t y_idx, label, label_index, multi_output_index + DTYPE_t label_weight = 1 + # TODO: Implement other WeightingStrategy values + bint use_distance_weighting = ( + self.weight_type == WeightingStrategy.distance + ) + + # Iterate through the sample k-nearest neighbours + for jdx in range(self.k): + # Absolute indice of the jdx-th Nearest Neighbors + # in range [0, n_samples_Y) + # TODO: inspect if it worth permuting this condition + # and the for-loop above for improved branching. + if use_distance_weighting: + label_weight = 1 / distances[jdx] + y_idx = indices[jdx] + label = self.labels[y_idx] + self.label_weights[sample_index][label] += label_weight + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, sample_index + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. 
+ for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][0], + &self.heaps_r_distances_chunks[thread_num][0], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). + for sample_index in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[sample_index, 0], + &self.argkmin_indices[sample_index, 0], + self.k, + ) + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return +from ._argkmin cimport ArgKmin32 +from ._datasets_pair cimport DatasetsPair32 + +cdef class ArgKminClassMode32(ArgKmin32): + """ + 32bit implementation of ArgKminClassMode. + """ + cdef: + const ITYPE_t[:] labels, + const ITYPE_t[:] unique_labels + DTYPE_t[:, :] label_weights + cpp_map[ITYPE_t, ITYPE_t] labels_to_index + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + ITYPE_t k, + weights, + labels, + unique_labels, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + """Compute the argkmin reduction with labels. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKminClassMode32`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance _must_ directly be created outside of this class method. + """ + # Use a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKminClassMode32( + datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + labels=labels, + unique_labels=unique_labels, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
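+            # Each sample's k neighbours occupy a contiguous slice of length
+            # k in the per-thread heap buffers, hence the `idx * self.k`
+            # offsets below.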
+ with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair32 datasets_pair, + const ITYPE_t[:] labels, + const ITYPE_t[:] unique_labels, + chunk_size=None, + strategy=None, + ITYPE_t k=1, + weights=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.callable + self.labels = labels + + self.unique_labels = unique_labels + + cdef ITYPE_t idx, label + # Map from set of unique labels to their indices in `label_weights` + # Buffer used in building a histogram for one-pass weighted mode + self.label_weights = np.zeros( + (self.n_samples_X, unique_labels.shape[0]), dtype=DTYPE, + ) + + def _finalize_results(self): + probabilities = np.asarray(self.label_weights) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + cdef inline void weighted_histogram_mode( + self, + ITYPE_t sample_index, + ITYPE_t* indices, + DTYPE_t* distances, + ) nogil: + cdef: + ITYPE_t y_idx, label, label_index, multi_output_index + DTYPE_t label_weight = 1 + # TODO: Implement other WeightingStrategy values + bint use_distance_weighting = ( + self.weight_type == WeightingStrategy.distance + ) + + # Iterate through the sample k-nearest neighbours + for jdx in range(self.k): + # Absolute indice of the jdx-th Nearest Neighbors + # in range [0, n_samples_Y) + # TODO: inspect if it worth permuting this condition + # and the for-loop above for improved branching. + if use_distance_weighting: + label_weight = 1 / distances[jdx] + y_idx = indices[jdx] + label = self.labels[y_idx] + self.label_weights[sample_index][label] += label_weight + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, sample_index + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][0], + &self.heaps_r_distances_chunks[thread_num][0], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). 
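+            # `threadpool_limits` comes from threadpoolctl (re-exported via
+            # `sklearn.utils.fixes`); the previous BLAS thread limit is
+            # restored when the block exits.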
+ for sample_index in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[sample_index, 0], + &self.argkmin_indices[sample_index, 0], + self.k, + ) + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return From eb263f0c8092ea1e07a45c645adc5050bb358ded Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 8 Feb 2023 18:34:52 -0500 Subject: [PATCH 8/8] Removed accidental inclusion --- .../_argkminclassmode.pyx | 385 ------------------ 1 file changed, 385 deletions(-) delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx deleted file mode 100644 index 102f7c99059b5..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkminclassmode.pyx +++ /dev/null @@ -1,385 +0,0 @@ - -from cython cimport floating, integral -from cython.parallel cimport parallel, prange -from libcpp.map cimport map as cpp_map, pair as cpp_pair -from libc.stdlib cimport free - -cimport numpy as cnp - -cnp.import_array() - -from ...utils._typedefs cimport ITYPE_t, DTYPE_t -from ...utils._typedefs import ITYPE, DTYPE -from ...utils._sorting cimport simultaneous_sort -import numpy as np -from scipy.sparse import issparse -from sklearn.utils.fixes import threadpool_limits - -cpdef enum WeightingStrategy: - uniform = 0 - # TODO: Implement the following options, most likely in - # `weighted_histogram_mode` - distance = 1 - callable = 2 -from ._argkmin cimport ArgKmin64 -from ._datasets_pair cimport DatasetsPair64 - -cdef class ArgKminClassMode64(ArgKmin64): - """ - 64bit implementation of ArgKminClassMode. - """ - cdef: - const ITYPE_t[:] labels, - const ITYPE_t[:] unique_labels - DTYPE_t[:, :] label_weights - cpp_map[ITYPE_t, ITYPE_t] labels_to_index - WeightingStrategy weight_type - - @classmethod - def compute( - cls, - X, - Y, - ITYPE_t k, - weights, - labels, - unique_labels, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - ): - """Compute the argkmin reduction with labels. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`ArgKminClassMode64`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance _must_ directly be created outside of this class method. - """ - # Use a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. - pda = ArgKminClassMode64( - datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs), - k=k, - chunk_size=chunk_size, - strategy=strategy, - weights=weights, - labels=labels, - unique_labels=unique_labels, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). 
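+                # `simultaneous_sort` sorts the distances in ascending order
+                # and applies the same permutation to the indices, keeping
+                # the two arrays aligned.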
- with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results() - - def __init__( - self, - DatasetsPair64 datasets_pair, - const ITYPE_t[:] labels, - const ITYPE_t[:] unique_labels, - chunk_size=None, - strategy=None, - ITYPE_t k=1, - weights=None, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - k=k, - ) - - if weights == "uniform": - self.weight_type = WeightingStrategy.uniform - elif weights == "distance": - self.weight_type = WeightingStrategy.distance - else: - self.weight_type = WeightingStrategy.callable - self.labels = labels - - self.unique_labels = unique_labels - - cdef ITYPE_t idx, label - # Map from set of unique labels to their indices in `label_weights` - # Buffer used in building a histogram for one-pass weighted mode - self.label_weights = np.zeros( - (self.n_samples_X, unique_labels.shape[0]), dtype=DTYPE, - ) - - def _finalize_results(self): - probabilities = np.asarray(self.label_weights) - probabilities /= probabilities.sum(axis=1, keepdims=True) - return probabilities - - cdef inline void weighted_histogram_mode( - self, - ITYPE_t sample_index, - ITYPE_t* indices, - DTYPE_t* distances, - ) nogil: - cdef: - ITYPE_t y_idx, label, label_index, multi_output_index - DTYPE_t label_weight = 1 - # TODO: Implement other WeightingStrategy values - bint use_distance_weighting = ( - self.weight_type == WeightingStrategy.distance - ) - - # Iterate through the sample k-nearest neighbours - for jdx in range(self.k): - # Absolute indice of the jdx-th Nearest Neighbors - # in range [0, n_samples_Y) - # TODO: inspect if it worth permuting this condition - # and the for-loop above for improved branching. - if use_distance_weighting: - label_weight = 1 / distances[jdx] - y_idx = indices[jdx] - label = self.labels[y_idx] - self.label_weights[sample_index][label] += label_weight - return - - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, sample_index - # Sorting the main heaps portion associated to `X[X_start:X_end]` - # in ascending order w.r.t the distances. - for idx in range(X_end - X_start): - simultaneous_sort( - self.heaps_r_distances_chunks[thread_num] + idx * self.k, - self.heaps_indices_chunks[thread_num] + idx * self.k, - self.k - ) - # One-pass top-one weighted mode - # Compute the absolute index in [0, n_samples_X) - sample_index = X_start + idx - self.weighted_histogram_mode( - sample_index, - &self.heaps_indices_chunks[thread_num][0], - &self.heaps_r_distances_chunks[thread_num][0], - ) - return - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t sample_index, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). 
- for sample_index in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[sample_index, 0], - &self.argkmin_indices[sample_index, 0], - self.k, - ) - self.weighted_histogram_mode( - sample_index, - &self.argkmin_indices[sample_index][0], - &self.argkmin_distances[sample_index][0], - ) - return -from ._argkmin cimport ArgKmin32 -from ._datasets_pair cimport DatasetsPair32 - -cdef class ArgKminClassMode32(ArgKmin32): - """ - 32bit implementation of ArgKminClassMode. - """ - cdef: - const ITYPE_t[:] labels, - const ITYPE_t[:] unique_labels - DTYPE_t[:, :] label_weights - cpp_map[ITYPE_t, ITYPE_t] labels_to_index - WeightingStrategy weight_type - - @classmethod - def compute( - cls, - X, - Y, - ITYPE_t k, - weights, - labels, - unique_labels, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - ): - """Compute the argkmin reduction with labels. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`ArgKminClassMode32`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance _must_ directly be created outside of this class method. - """ - # Use a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. - pda = ArgKminClassMode32( - datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs), - k=k, - chunk_size=chunk_size, - strategy=strategy, - weights=weights, - labels=labels, - unique_labels=unique_labels, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). 
- with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results() - - def __init__( - self, - DatasetsPair32 datasets_pair, - const ITYPE_t[:] labels, - const ITYPE_t[:] unique_labels, - chunk_size=None, - strategy=None, - ITYPE_t k=1, - weights=None, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - k=k, - ) - - if weights == "uniform": - self.weight_type = WeightingStrategy.uniform - elif weights == "distance": - self.weight_type = WeightingStrategy.distance - else: - self.weight_type = WeightingStrategy.callable - self.labels = labels - - self.unique_labels = unique_labels - - cdef ITYPE_t idx, label - # Map from set of unique labels to their indices in `label_weights` - # Buffer used in building a histogram for one-pass weighted mode - self.label_weights = np.zeros( - (self.n_samples_X, unique_labels.shape[0]), dtype=DTYPE, - ) - - def _finalize_results(self): - probabilities = np.asarray(self.label_weights) - probabilities /= probabilities.sum(axis=1, keepdims=True) - return probabilities - - cdef inline void weighted_histogram_mode( - self, - ITYPE_t sample_index, - ITYPE_t* indices, - DTYPE_t* distances, - ) nogil: - cdef: - ITYPE_t y_idx, label, label_index, multi_output_index - DTYPE_t label_weight = 1 - # TODO: Implement other WeightingStrategy values - bint use_distance_weighting = ( - self.weight_type == WeightingStrategy.distance - ) - - # Iterate through the sample k-nearest neighbours - for jdx in range(self.k): - # Absolute indice of the jdx-th Nearest Neighbors - # in range [0, n_samples_Y) - # TODO: inspect if it worth permuting this condition - # and the for-loop above for improved branching. - if use_distance_weighting: - label_weight = 1 / distances[jdx] - y_idx = indices[jdx] - label = self.labels[y_idx] - self.label_weights[sample_index][label] += label_weight - return - - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, sample_index - # Sorting the main heaps portion associated to `X[X_start:X_end]` - # in ascending order w.r.t the distances. - for idx in range(X_end - X_start): - simultaneous_sort( - self.heaps_r_distances_chunks[thread_num] + idx * self.k, - self.heaps_indices_chunks[thread_num] + idx * self.k, - self.k - ) - # One-pass top-one weighted mode - # Compute the absolute index in [0, n_samples_X) - sample_index = X_start + idx - self.weighted_histogram_mode( - sample_index, - &self.heaps_indices_chunks[thread_num][0], - &self.heaps_r_distances_chunks[thread_num][0], - ) - return - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t sample_index, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). - for sample_index in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[sample_index, 0], - &self.argkmin_indices[sample_index, 0], - self.k, - ) - self.weighted_histogram_mode( - sample_index, - &self.argkmin_indices[sample_index][0], - &self.argkmin_distances[sample_index][0], - ) - return