33# License: BSD 3 clause
44import numpy as np
55import pytest
6+ from scipy import sparse
67import warnings
78
89from sklearn .datasets import make_blobs
1516from sklearn .utils import shuffle
1617from sklearn .utils ._testing import assert_array_equal
1718from sklearn .utils ._testing import assert_allclose
18-
19+ from sklearn . exceptions import EfficiencyWarning
1920from sklearn .cluster .tests .common import generate_clustered_data
2021
2122
@@ -158,15 +159,19 @@ def test_cluster_hierarchy_(global_dtype):
158159 assert diff / len (X ) < 0.05
159160
160161
161- def test_correct_number_of_clusters ():
162+ @pytest .mark .parametrize (
163+ "metric, is_sparse" ,
164+ [["minkowski" , False ], ["euclidean" , True ]],
165+ )
166+ def test_correct_number_of_clusters (metric , is_sparse ):
162167 # in 'auto' mode
163168
164169 n_clusters = 3
165170 X = generate_clustered_data (n_clusters = n_clusters )
166171 # Parameters chosen specifically for this task.
167172 # Compute OPTICS
168- clust = OPTICS (max_eps = 5.0 * 6.0 , min_samples = 4 , xi = 0.1 )
169- clust .fit (X )
173+ clust = OPTICS (max_eps = 5.0 * 6.0 , min_samples = 4 , xi = 0.1 , metric = metric )
174+ clust .fit (sparse . csr_matrix ( X ) if is_sparse else X )
170175 # number of clusters, ignoring noise if present
171176 n_clusters_1 = len (set (clust .labels_ )) - int (- 1 in clust .labels_ )
172177 assert n_clusters_1 == n_clusters
@@ -286,18 +291,25 @@ def test_close_extract():
286291
287292@pytest .mark .parametrize ("eps" , [0.1 , 0.3 , 0.5 ])
288293@pytest .mark .parametrize ("min_samples" , [3 , 10 , 20 ])
289- def test_dbscan_optics_parity (eps , min_samples , global_dtype ):
294+ @pytest .mark .parametrize (
295+ "metric, is_sparse" ,
296+ [["minkowski" , False ], ["euclidean" , False ], ["euclidean" , True ]],
297+ )
298+ def test_dbscan_optics_parity (eps , min_samples , metric , is_sparse , global_dtype ):
290299 # Test that OPTICS clustering labels are <= 5% difference of DBSCAN
291300
292301 centers = [[1 , 1 ], [- 1 , - 1 ], [1 , - 1 ]]
293302 X , labels_true = make_blobs (
294303 n_samples = 750 , centers = centers , cluster_std = 0.4 , random_state = 0
295304 )
305+ X = sparse .csr_matrix (X ) if is_sparse else X
296306
297307 X = X .astype (global_dtype , copy = False )
298308
299309 # calculate optics with dbscan extract at 0.3 epsilon
300- op = OPTICS (min_samples = min_samples , cluster_method = "dbscan" , eps = eps ).fit (X )
310+ op = OPTICS (
311+ min_samples = min_samples , cluster_method = "dbscan" , eps = eps , metric = metric
312+ ).fit (X )
301313
302314 # calculate dbscan labels
303315 db = DBSCAN (eps = eps , min_samples = min_samples ).fit (X )
@@ -344,7 +356,8 @@ def test_min_cluster_size(min_cluster_size, global_dtype):
344356 assert min (cluster_sizes ) >= min_cluster_size
345357 # check behaviour is the same when min_cluster_size is a fraction
346358 clust_frac = OPTICS (
347- min_samples = 9 , min_cluster_size = min_cluster_size / redX .shape [0 ]
359+ min_samples = 9 ,
360+ min_cluster_size = min_cluster_size / redX .shape [0 ],
348361 )
349362 clust_frac .fit (redX )
350363 assert_array_equal (clust .labels_ , clust_frac .labels_ )
@@ -356,17 +369,26 @@ def test_min_cluster_size_invalid(min_cluster_size):
356369 with pytest .raises (ValueError , match = "must be a positive integer or a " ):
357370 clust .fit (X )
358371
372+ clust = OPTICS (min_cluster_size = min_cluster_size , metric = "euclidean" )
373+ with pytest .raises (ValueError , match = "must be a positive integer or a " ):
374+ clust .fit (sparse .csr_matrix (X ))
375+
359376
def test_min_cluster_size_invalid2():
    """Check that fitting OPTICS with ``min_cluster_size=len(X) + 1`` raises.

    The same ``ValueError`` is expected for the dense ``X`` with the default
    metric and for its CSR form with ``metric="euclidean"``.
    """
    oversized = len(X) + 1
    message = "must be no greater than the "

    # Dense input, default metric.
    dense_model = OPTICS(min_cluster_size=oversized)
    with pytest.raises(ValueError, match=message):
        dense_model.fit(X)

    # Sparse (CSR) input, explicit euclidean metric.
    sparse_model = OPTICS(min_cluster_size=oversized, metric="euclidean")
    with pytest.raises(ValueError, match=message):
        sparse_model.fit(sparse.csr_matrix(X))
385+
365386
366387def test_processing_order ():
367388 # Ensure that we consider all unprocessed points,
368389 # not only direct neighbors. when picking the next point.
369390 Y = [[0 ], [10 ], [- 10 ], [25 ]]
391+
370392 clust = OPTICS (min_samples = 3 , max_eps = 15 ).fit (Y )
371393 assert_array_equal (clust .reachability_ , [np .inf , 10 , 10 , 15 ])
372394 assert_array_equal (clust .core_distances_ , [10 , 15 , np .inf , np .inf ])
@@ -796,10 +818,16 @@ def test_extract_dbscan(global_dtype):
796818 assert_array_equal (np .sort (np .unique (clust .labels_ )), [0 , 1 , 2 , 3 ])
797819
798820
799- def test_precomputed_dists (global_dtype ):
821+ @pytest .mark .parametrize ("is_sparse" , [False , True ])
822+ def test_precomputed_dists (is_sparse , global_dtype ):
800823 redX = X [::2 ].astype (global_dtype , copy = False )
801824 dists = pairwise_distances (redX , metric = "euclidean" )
802- clust1 = OPTICS (min_samples = 10 , algorithm = "brute" , metric = "precomputed" ).fit (dists )
825+ dists = sparse .csr_matrix (dists ) if is_sparse else dists
826+ with warnings .catch_warnings ():
827+ warnings .simplefilter ("ignore" , EfficiencyWarning )
828+ clust1 = OPTICS (min_samples = 10 , algorithm = "brute" , metric = "precomputed" ).fit (
829+ dists
830+ )
803831 clust2 = OPTICS (min_samples = 10 , algorithm = "brute" , metric = "euclidean" ).fit (redX )
804832
805833 assert_allclose (clust1 .reachability_ , clust2 .reachability_ )
0 commit comments