diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index c0c281ce31475..8f4629f573842 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -287,22 +287,37 @@ def test_hdbscan_precomputed_non_brute(tree):
 def test_hdbscan_sparse():
     """
     Tests that HDBSCAN works correctly when passing sparse feature data.
+    Evaluates correctness by comparing against the same data passed as a dense
+    array.
     """
-    sparse_X = sparse.csr_matrix(X)
 
-    labels = HDBSCAN().fit(sparse_X).labels_
-    n_clusters = len(set(labels) - OUTLIER_SET)
+    dense_labels = HDBSCAN().fit(X).labels_
+    n_clusters = len(set(dense_labels) - OUTLIER_SET)
     assert n_clusters == 3
 
-    sparse_X_nan = sparse_X.copy()
-    sparse_X_nan[0, 0] = np.nan
-    labels = HDBSCAN().fit(sparse_X_nan).labels_
-    n_clusters = len(set(labels) - OUTLIER_SET)
-    assert n_clusters == 3
+    _X_sparse = sparse.csr_matrix(X)
+    X_sparse = _X_sparse.copy()
+    sparse_labels = HDBSCAN().fit(X_sparse).labels_
+    assert_array_equal(dense_labels, sparse_labels)
+
+    # Compare that the sparse and dense non-precomputed routines return the same labels
+    # where the 0th observation contains the outlier.
+    for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")):
+        X_dense = X.copy()
+        X_dense[0, 0] = outlier_val
+        dense_labels = HDBSCAN().fit(X_dense).labels_
+        n_clusters = len(set(dense_labels) - OUTLIER_SET)
+        assert n_clusters == 3
+        assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]
+
+        X_sparse = _X_sparse.copy()
+        X_sparse[0, 0] = outlier_val
+        sparse_labels = HDBSCAN().fit(X_sparse).labels_
+        assert_array_equal(dense_labels, sparse_labels)
 
     msg = "Sparse data matrices only support algorithm `brute`."
     with pytest.raises(ValueError, match=msg):
-        HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X)
+        HDBSCAN(metric="euclidean", algorithm="balltree").fit(X_sparse)
 
 
 @pytest.mark.parametrize("algorithm", ALGORITHMS)