Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

ENH Extends outlier encoding scheme to HDBSCAN.dbscan_clustering #24698

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Feb 9, 2023
27 changes: 21 additions & 6 deletions sklearn/cluster/_hdbscan/hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@

FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics

# Encodings are arbitray, but chosen as extensions to the -1 noise label.
# Encodings are arbitrary but must be strictly negative.
# The current encodings are chosen as extensions to the -1 noise label.
# Avoided enums so that the end user only deals with simple labels.
_OUTLIER_ENCODING = {
"infinite": {
Expand Down Expand Up @@ -368,7 +369,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
Outliers are labeled as follows:
- Noisy samples are given the label -1.
- Samples with infinite elements (+/- np.inf) are given the label -2.
- Samples with missing data are given the label -3.
- Samples with missing data are given the label -3, even if they
also have infinite elements.

probabilities_ : ndarray of shape (n_samples,)
The strength with which each sample is a member of its assigned
Expand Down Expand Up @@ -670,7 +672,8 @@ def fit(self, X, y=None):
self._single_linkage_tree_ = remap_single_linkage_tree(
self._single_linkage_tree_,
internal_to_raw,
non_finite=infinite_index + missing_index,
# There may be overlap for points w/ both `np.inf` and `np.nan`
non_finite=set(infinite_index + missing_index),
)
new_labels = np.empty(self._raw_data.shape[0], dtype=np.int32)
new_labels[finite_index] = self.labels_
Expand Down Expand Up @@ -771,12 +774,24 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5):
Returns
-------
labels : ndarray of shape (n_samples,)
An array of cluster labels, one per datapoint. Unclustered points
are assigned the label -1.
An array of cluster labels, one per datapoint.
Outliers are labeled as follows:
- Noisy samples are given the label -1.
- Samples with infinite elements (+/- np.inf) are given the label -2.
- Samples with missing data are given the label -3, even if they
also have infinite elements.
"""
return labelling_at_cut(
labels = labelling_at_cut(
self._single_linkage_tree_, cut_distance, min_cluster_size
)
# Infer indices from labels generated during `fit`
infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"]
missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"]

# Overwrite infinite/missing outlier samples (otherwise simple noise)
labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"]
labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"]
return labels

def _more_tags(self):
return {"allow_nan": self.metric != "precomputed"}
29 changes: 28 additions & 1 deletion sklearn/cluster/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,40 @@ def test_hdbscan_algorithms(algo, metric):
hdb.fit(X)


def test_hdbscan_dbscan_clustering():
def test_dbscan_clustering():
    """Check that `dbscan_clustering` recovers the expected cluster count."""
    model = HDBSCAN().fit(X)
    flat_labels = model.dbscan_clustering(0.3)
    # Outlier encodings (-1/-2/-3) are not clusters; drop them before counting.
    found_clusters = set(flat_labels).difference(OUTLIER_SET)
    assert len(found_clusters) == n_clusters_true


@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1))
def test_dbscan_clustering_outlier_data(cut_distance):
    """
    Tests if np.inf and np.nan data are each treated as special outliers.

    Samples containing np.nan must receive the "missing" label (even when they
    also contain np.inf), samples containing only np.inf the "infinite" label,
    and removing all such samples must not change the labels of the remaining
    points.
    """
    missing_label = _OUTLIER_ENCODING["missing"]["label"]
    infinite_label = _OUTLIER_ENCODING["infinite"]["label"]

    X_outlier = X.copy()
    X_outlier[0] = [np.inf, 1]
    X_outlier[2] = [1, np.nan]
    X_outlier[5] = [np.inf, np.nan]  # missing takes precedence over infinite
    model = HDBSCAN().fit(X_outlier)
    labels = model.dbscan_clustering(cut_distance=cut_distance)

    missing_labels_idx = np.flatnonzero(labels == missing_label)
    assert_array_equal(missing_labels_idx, [2, 5])

    infinite_labels_idx = np.flatnonzero(labels == infinite_label)
    assert_array_equal(infinite_labels_idx, [0])

    # BUG FIX: `missing_labels_idx + infinite_labels_idx` performed
    # element-wise ndarray addition ([2, 5] + [0] broadcasts to [2, 5]),
    # not concatenation, so index 0 was never excluded from `clean_idx`.
    # Concatenate the index arrays before building the exclusion set.
    outlier_idx = np.concatenate([missing_labels_idx, infinite_labels_idx])
    # Use the actual sample count rather than a hard-coded 200.
    clean_idx = list(set(range(X_outlier.shape[0])) - set(outlier_idx))
    clean_model = HDBSCAN().fit(X_outlier[clean_idx])
    clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance)
    assert_array_equal(clean_labels, labels[clean_idx])


def test_hdbscan_high_dimensional():
H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
H = StandardScaler().fit_transform(H)
Expand Down