Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

ENH Extends outlier encoding scheme to HDBSCAN.dbscan_clustering #24698

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Feb 9, 2023
27 changes: 21 additions & 6 deletions sklearn/cluster/_hdbscan/hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@

FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics

# Encodings are arbitray, but chosen as extensions to the -1 noise label.
# Encodings are arbitrary but must be strictly negative.
# The current encodings are chosen as extensions to the -1 noise label.
# Avoided enums so that the end user only deals with simple labels.
_OUTLIER_ENCODING = {
"infinite": {
Expand Down Expand Up @@ -368,7 +369,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
Outliers are labeled as follows:
- Noisy samples are given the label -1.
- Samples with infinite elements (+/- np.inf) are given the label -2.
- Samples with missing data are given the label -3.
- Samples with missing data are given the label -3, even if they
also have infinite elements.

probabilities_ : ndarray of shape (n_samples,)
The strength with which each sample is a member of its assigned
Expand Down Expand Up @@ -670,7 +672,8 @@ def fit(self, X, y=None):
self._single_linkage_tree_ = remap_single_linkage_tree(
self._single_linkage_tree_,
internal_to_raw,
non_finite=infinite_index + missing_index,
# There may be overlap for points w/ both `np.inf` and `np.nan`
non_finite=set(infinite_index + missing_index),
)
new_labels = np.empty(self._raw_data.shape[0], dtype=np.int32)
new_labels[finite_index] = self.labels_
Expand Down Expand Up @@ -771,12 +774,24 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5):
Returns
-------
labels : ndarray of shape (n_samples,)
An array of cluster labels, one per datapoint. Unclustered points
are assigned the label -1.
An array of cluster labels, one per datapoint.
Outliers are labeled as follows:
- Noisy samples are given the label -1.
- Samples with infinite elements (+/- np.inf) are given the label -2.
- Samples with missing data are given the label -3, even if they
also have infinite elements.
"""
return labelling_at_cut(
labels = labelling_at_cut(
self._single_linkage_tree_, cut_distance, min_cluster_size
)
# Infer indices from labels generated during `fit`
infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"]
missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"]

# Overwrite infinite/missing outlier samples (otherwise simple noise)
labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"]
labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"]
return labels

def _more_tags(self):
return {"allow_nan": self.metric != "precomputed"}
29 changes: 28 additions & 1 deletion sklearn/cluster/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,40 @@ def test_hdbscan_algorithms(algo, metric):
hdb.fit(X)


def test_hdbscan_dbscan_clustering():
def test_dbscan_clustering():
    """Check that `dbscan_clustering` recovers the expected cluster count."""
    model = HDBSCAN().fit(X)
    flat_labels = model.dbscan_clustering(0.3)
    # Outlier encodings (-1/-2/-3) are not clusters; drop them before counting.
    found_clusters = set(flat_labels).difference(OUTLIER_SET)
    assert len(found_clusters) == n_clusters_true


@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1))
def test_dbscan_clustering_outlier_data(cut_distance):
    """
    Tests if np.inf and np.nan data are each treated as special outliers.

    Samples containing np.nan must receive the "missing" label (even when they
    also contain np.inf), samples containing only np.inf the "infinite" label,
    and removing all such samples must not change the labels of the remaining
    points.
    """
    missing_label = _OUTLIER_ENCODING["missing"]["label"]
    infinite_label = _OUTLIER_ENCODING["infinite"]["label"]

    X_outlier = X.copy()
    X_outlier[0] = [np.inf, 1]
    X_outlier[2] = [1, np.nan]
    X_outlier[5] = [np.inf, np.nan]  # missing takes precedence over infinite
    model = HDBSCAN().fit(X_outlier)
    labels = model.dbscan_clustering(cut_distance=cut_distance)

    missing_labels_idx = np.flatnonzero(labels == missing_label)
    assert_array_equal(missing_labels_idx, [2, 5])

    infinite_labels_idx = np.flatnonzero(labels == infinite_label)
    assert_array_equal(infinite_labels_idx, [0])

    # BUG FIX: `missing_labels_idx + infinite_labels_idx` performed
    # element-wise ndarray addition ([2, 5] + [0] broadcasts to [2, 5]),
    # not concatenation, so index 0 was never excluded from `clean_idx`.
    # Concatenate the index arrays before building the exclusion set.
    outlier_idx = np.concatenate([missing_labels_idx, infinite_labels_idx])
    # Use the actual sample count rather than a hard-coded 200.
    clean_idx = list(set(range(X_outlier.shape[0])) - set(outlier_idx))
    clean_model = HDBSCAN().fit(X_outlier[clean_idx])
    clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance)
    assert_array_equal(clean_labels, labels[clean_idx])


def test_hdbscan_high_dimensional():
H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
H = StandardScaler().fit_transform(H)
Expand Down