diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 4f1fcf1962d0b..ae3645c88dccb 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -32,7 +32,8 @@
 
 FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics
 
-# Encodings are arbitray, but chosen as extensions to the -1 noise label.
+# Encodings are arbitrary but must be strictly negative.
+# The current encodings are chosen as extensions to the -1 noise label.
 # Avoided enums so that the end user only deals with simple labels.
 _OUTLIER_ENCODING = {
     "infinite": {
@@ -368,7 +369,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
         Outliers are labeled as follows:
         - Noisy samples are given the label -1.
         - Samples with infinite elements (+/- np.inf) are given the label -2.
-        - Samples with missing data are given the label -3.
+        - Samples with missing data are given the label -3, even if they
+          also have infinite elements.
 
     probabilities_ : ndarray of shape (n_samples,)
         The strength with which each sample is a member of its assigned
@@ -670,7 +672,8 @@ def fit(self, X, y=None):
             self._single_linkage_tree_ = remap_single_linkage_tree(
                 self._single_linkage_tree_,
                 internal_to_raw,
-                non_finite=infinite_index + missing_index,
+                # There may be overlap for points w/ both `np.inf` and `np.nan`
+                non_finite=set(infinite_index + missing_index),
             )
             new_labels = np.empty(self._raw_data.shape[0], dtype=np.int32)
             new_labels[finite_index] = self.labels_
@@ -771,12 +774,24 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5):
         Returns
         -------
         labels : ndarray of shape (n_samples,)
-            An array of cluster labels, one per datapoint. Unclustered points
-            are assigned the label -1.
+            An array of cluster labels, one per datapoint.
+            Outliers are labeled as follows:
+            - Noisy samples are given the label -1.
+            - Samples with infinite elements (+/- np.inf) are given the label -2.
+            - Samples with missing data are given the label -3, even if they
+              also have infinite elements.
         """
-        return labelling_at_cut(
+        labels = labelling_at_cut(
             self._single_linkage_tree_, cut_distance, min_cluster_size
         )
+        # Infer indices from labels generated during `fit`
+        infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"]
+        missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"]
+
+        # Overwrite infinite/missing outlier samples (otherwise simple noise)
+        labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"]
+        labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"]
+        return labels
 
     def _more_tags(self):
         return {"allow_nan": self.metric != "precomputed"}
diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index 65f9829ffef5c..8181c0d2cb218 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -143,13 +143,40 @@ def test_hdbscan_algorithms(algo, metric):
     hdb.fit(X)
 
 
-def test_hdbscan_dbscan_clustering():
+def test_dbscan_clustering():
     clusterer = HDBSCAN().fit(X)
     labels = clusterer.dbscan_clustering(0.3)
     n_clusters = len(set(labels) - OUTLIER_SET)
     assert n_clusters == n_clusters_true
 
 
+@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1))
+def test_dbscan_clustering_outlier_data(cut_distance):
+    """
+    Tests if np.inf and np.nan data are each treated as special outliers.
+ """ + missing_label = _OUTLIER_ENCODING["missing"]["label"] + infinite_label = _OUTLIER_ENCODING["infinite"]["label"] + + X_outlier = X.copy() + X_outlier[0] = [np.inf, 1] + X_outlier[2] = [1, np.nan] + X_outlier[5] = [np.inf, np.nan] + model = HDBSCAN().fit(X_outlier) + labels = model.dbscan_clustering(cut_distance=cut_distance) + + missing_labels_idx = np.flatnonzero(labels == missing_label) + assert_array_equal(missing_labels_idx, [2, 5]) + + infinite_labels_idx = np.flatnonzero(labels == infinite_label) + assert_array_equal(infinite_labels_idx, [0]) + + clean_idx = list(set(range(200)) - set(missing_labels_idx + infinite_labels_idx)) + clean_model = HDBSCAN().fit(X_outlier[clean_idx]) + clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) + assert_array_equal(clean_labels, labels[clean_idx]) + + def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) H = StandardScaler().fit_transform(H)