Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit eb08740

Browse files
StefanieSengerglemaitrejjerphan
authored
TST Extend tests for scipy.sparse.*array in sklearn/tests/test_random_projection.py (#27314)
Co-authored-by: Guillaume Lemaitre <[email protected]> Co-authored-by: Julien Jerphanion <[email protected]>
1 parent 8db3aac commit eb08740

File tree

2 files changed

+151
-33
lines changed

2 files changed

+151
-33
lines changed

doc/whats_new/v1.4.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ and classes are impacted:
169169
- :class:`feature_extraction.text.TfidfTransformer` in :pr:`27219` by
170170
:user:`Yao Xiao <Charlie-XIAO>`;
171171
- :class:`manifold.Isomap` in :pr:`27250` by :user:`Yao Xiao <Charlie-XIAO>`;
172-
- :func:`manifold.SpectralEmbedding` in :pr:`27240` by :user:`Yao Xiao <Charlie-XIAO>`;
172+
- :class:`manifold.SpectralEmbedding` in :pr:`27240` by :user:`Yao Xiao <Charlie-XIAO>`;
173173
- :class:`manifold.TSNE` in :pr:`27250` by :user:`Yao Xiao <Charlie-XIAO>`;
174174
- :class:`impute.SimpleImputer` in :pr:`27277` by :user:`Yao Xiao <Charlie-XIAO>`;
175175
- :class:`impute.IterativeImputer` in :pr:`27277` by :user:`Yao Xiao <Charlie-XIAO>`;
@@ -180,6 +180,10 @@ and classes are impacted:
180180
:user:`Yao Xiao <Charlie-XIAO>`;
181181
- :class:`preprocessing.PolynomialFeatures` in :pr:`27166` by
182182
:user:`Mohit Joshi <work-mohit>`.
183+
- :class:`random_projection.GaussianRandomProjection` in :pr:`27314` by
184+
:user:`Stefanie Senger <StefanieSenger>`.
185+
- :class:`random_projection.SparseRandomProjection` in :pr:`27314` by
186+
:user:`Stefanie Senger <StefanieSenger>`.
183187

184188
Changelog
185189
---------

sklearn/tests/test_random_projection.py

Lines changed: 146 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pytest
77
import scipy.sparse as sp
88

9-
from sklearn.exceptions import DataDimensionalityWarning
9+
from sklearn.exceptions import DataDimensionalityWarning, NotFittedError
1010
from sklearn.metrics import euclidean_distances
1111
from sklearn.random_projection import (
1212
GaussianRandomProjection,
@@ -22,6 +22,7 @@
2222
assert_array_almost_equal,
2323
assert_array_equal,
2424
)
25+
from sklearn.utils.fixes import COO_CONTAINERS
2526

2627
all_sparse_random_matrix: List[Any] = [_sparse_random_matrix]
2728
all_dense_random_matrix: List[Any] = [_gaussian_random_matrix]
@@ -32,11 +33,20 @@
3233
all_RandomProjection = all_SparseRandomProjection + all_DenseRandomProjection
3334

3435

35-
# Make some random data with uniformly located non zero entries with
36-
# Gaussian distributed values
37-
def make_sparse_random_data(n_samples, n_features, n_nonzeros, random_state=0):
36+
def make_sparse_random_data(
37+
coo_container,
38+
n_samples,
39+
n_features,
40+
n_nonzeros,
41+
random_state=None,
42+
sparse_format="csr",
43+
):
44+
"""Make some random data with uniformly located non zero entries with
45+
Gaussian distributed values; `sparse_format` can be `"csr"` (default) or
46+
`None` (in which case a dense array is returned).
47+
"""
3848
rng = np.random.RandomState(random_state)
39-
data_coo = sp.coo_matrix(
49+
data_coo = coo_container(
4050
(
4151
rng.randn(n_nonzeros),
4252
(
@@ -46,7 +56,10 @@ def make_sparse_random_data(n_samples, n_features, n_nonzeros, random_state=0):
4656
),
4757
shape=(n_samples, n_features),
4858
)
49-
return data_coo.toarray(), data_coo.tocsr()
59+
if sparse_format is not None:
60+
return data_coo.asformat(sparse_format)
61+
else:
62+
return data_coo.toarray()
5063

5164

5265
def densify(matrix):
@@ -58,7 +71,6 @@ def densify(matrix):
5871

5972
n_samples, n_features = (10, 1000)
6073
n_nonzeros = int(n_samples * n_features / 100.0)
61-
data, data_csr = make_sparse_random_data(n_samples, n_features, n_nonzeros)
6274

6375

6476
###############################################################################
@@ -221,14 +233,31 @@ def test_random_projection_transformer_invalid_input():
221233
RandomProjection(n_components=n_components).fit(fit_data)
222234

223235

224-
def test_try_to_transform_before_fit():
236+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
237+
def test_try_to_transform_before_fit(coo_container, global_random_seed):
238+
data = make_sparse_random_data(
239+
coo_container,
240+
n_samples,
241+
n_features,
242+
n_nonzeros,
243+
random_state=global_random_seed,
244+
sparse_format=None,
245+
)
225246
for RandomProjection in all_RandomProjection:
226-
with pytest.raises(ValueError):
247+
with pytest.raises(NotFittedError):
227248
RandomProjection(n_components="auto").transform(data)
228249

229250

230-
def test_too_many_samples_to_find_a_safe_embedding():
231-
data, _ = make_sparse_random_data(1000, 100, 1000)
251+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
252+
def test_too_many_samples_to_find_a_safe_embedding(coo_container, global_random_seed):
253+
data = make_sparse_random_data(
254+
coo_container,
255+
n_samples=1000,
256+
n_features=100,
257+
n_nonzeros=1000,
258+
random_state=global_random_seed,
259+
sparse_format=None,
260+
)
232261

233262
for RandomProjection in all_RandomProjection:
234263
rp = RandomProjection(n_components="auto", eps=0.1)
@@ -241,8 +270,16 @@ def test_too_many_samples_to_find_a_safe_embedding():
241270
rp.fit(data)
242271

243272

244-
def test_random_projection_embedding_quality():
245-
data, _ = make_sparse_random_data(8, 5000, 15000)
273+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
274+
def test_random_projection_embedding_quality(coo_container):
275+
data = make_sparse_random_data(
276+
coo_container,
277+
n_samples=8,
278+
n_features=5000,
279+
n_nonzeros=15000,
280+
random_state=0,
281+
sparse_format=None,
282+
)
246283
eps = 0.2
247284

248285
original_distances = euclidean_distances(data, squared=True)
@@ -271,28 +308,54 @@ def test_random_projection_embedding_quality():
271308
assert 1 - eps < distances_ratio.min()
272309

273310

274-
def test_SparseRandomProj_output_representation():
311+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
312+
def test_SparseRandomProj_output_representation(coo_container):
313+
dense_data = make_sparse_random_data(
314+
coo_container,
315+
n_samples,
316+
n_features,
317+
n_nonzeros,
318+
random_state=0,
319+
sparse_format=None,
320+
)
321+
sparse_data = make_sparse_random_data(
322+
coo_container,
323+
n_samples,
324+
n_features,
325+
n_nonzeros,
326+
random_state=0,
327+
sparse_format="csr",
328+
)
275329
for SparseRandomProj in all_SparseRandomProjection:
276330
# when using sparse input, the projected data can be forced to be a
277331
# dense numpy array
278332
rp = SparseRandomProj(n_components=10, dense_output=True, random_state=0)
279-
rp.fit(data)
280-
assert isinstance(rp.transform(data), np.ndarray)
281-
282-
sparse_data = sp.csr_matrix(data)
333+
rp.fit(dense_data)
334+
assert isinstance(rp.transform(dense_data), np.ndarray)
283335
assert isinstance(rp.transform(sparse_data), np.ndarray)
284336

285337
# the output can be left to a sparse matrix instead
286338
rp = SparseRandomProj(n_components=10, dense_output=False, random_state=0)
287-
rp = rp.fit(data)
339+
rp = rp.fit(dense_data)
288340
# output for dense input will stay dense:
289-
assert isinstance(rp.transform(data), np.ndarray)
341+
assert isinstance(rp.transform(dense_data), np.ndarray)
290342

291343
# output for sparse input will be sparse:
292344
assert sp.issparse(rp.transform(sparse_data))
293345

294346

295-
def test_correct_RandomProjection_dimensions_embedding():
347+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
348+
def test_correct_RandomProjection_dimensions_embedding(
349+
coo_container, global_random_seed
350+
):
351+
data = make_sparse_random_data(
352+
coo_container,
353+
n_samples,
354+
n_features,
355+
n_nonzeros,
356+
random_state=global_random_seed,
357+
sparse_format=None,
358+
)
296359
for RandomProjection in all_RandomProjection:
297360
rp = RandomProjection(n_components="auto", random_state=0, eps=0.5).fit(data)
298361

@@ -334,24 +397,52 @@ def test_correct_RandomProjection_dimensions_embedding():
334397
assert 85 < rp.components_.nnz # close to 1% density
335398

336399

337-
def test_warning_n_components_greater_than_n_features():
400+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
401+
def test_warning_n_components_greater_than_n_features(
402+
coo_container, global_random_seed
403+
):
338404
n_features = 20
339-
data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))
405+
n_samples = 5
406+
n_nonzeros = int(n_features / 4)
407+
data = make_sparse_random_data(
408+
coo_container,
409+
n_samples,
410+
n_features,
411+
n_nonzeros,
412+
random_state=global_random_seed,
413+
sparse_format=None,
414+
)
340415

341416
for RandomProjection in all_RandomProjection:
342417
with pytest.warns(DataDimensionalityWarning):
343418
RandomProjection(n_components=n_features + 1).fit(data)
344419

345420

346-
def test_works_with_sparse_data():
421+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
422+
def test_works_with_sparse_data(coo_container, global_random_seed):
347423
n_features = 20
348-
data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))
424+
n_samples = 5
425+
n_nonzeros = int(n_features / 4)
426+
dense_data = make_sparse_random_data(
427+
coo_container,
428+
n_samples,
429+
n_features,
430+
n_nonzeros,
431+
random_state=global_random_seed,
432+
sparse_format=None,
433+
)
434+
sparse_data = make_sparse_random_data(
435+
coo_container,
436+
n_samples,
437+
n_features,
438+
n_nonzeros,
439+
random_state=global_random_seed,
440+
sparse_format="csr",
441+
)
349442

350443
for RandomProjection in all_RandomProjection:
351-
rp_dense = RandomProjection(n_components=3, random_state=1).fit(data)
352-
rp_sparse = RandomProjection(n_components=3, random_state=1).fit(
353-
sp.csr_matrix(data)
354-
)
444+
rp_dense = RandomProjection(n_components=3, random_state=1).fit(dense_data)
445+
rp_sparse = RandomProjection(n_components=3, random_state=1).fit(sparse_data)
355446
assert_array_almost_equal(
356447
densify(rp_dense.components_), densify(rp_sparse.components_)
357448
)
@@ -365,8 +456,19 @@ def test_johnson_lindenstrauss_min_dim():
365456
assert johnson_lindenstrauss_min_dim(100, eps=1e-5) == 368416070986
366457

367458

459+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
368460
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
369-
def test_random_projection_feature_names_out(random_projection_cls):
461+
def test_random_projection_feature_names_out(
462+
coo_container, random_projection_cls, global_random_seed
463+
):
464+
data = make_sparse_random_data(
465+
coo_container,
466+
n_samples,
467+
n_features,
468+
n_nonzeros,
469+
random_state=global_random_seed,
470+
sparse_format=None,
471+
)
370472
random_projection = random_projection_cls(n_components=2)
371473
random_projection.fit(data)
372474
names_out = random_projection.get_feature_names_out()
@@ -379,11 +481,13 @@ def test_random_projection_feature_names_out(random_projection_cls):
379481
assert_array_equal(names_out, expected_names_out)
380482

381483

484+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
382485
@pytest.mark.parametrize("n_samples", (2, 9, 10, 11, 1000))
383486
@pytest.mark.parametrize("n_features", (2, 9, 10, 11, 1000))
384487
@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
385488
@pytest.mark.parametrize("compute_inverse_components", [True, False])
386489
def test_inverse_transform(
490+
coo_container,
387491
n_samples,
388492
n_features,
389493
random_projection_cls,
@@ -398,11 +502,21 @@ def test_inverse_transform(
398502
random_state=global_random_seed,
399503
)
400504

401-
X_dense, X_csr = make_sparse_random_data(
505+
X_dense = make_sparse_random_data(
506+
coo_container,
507+
n_samples,
508+
n_features,
509+
n_nonzeros=n_samples * n_features // 100 + 1,
510+
random_state=global_random_seed,
511+
sparse_format=None,
512+
)
513+
X_csr = make_sparse_random_data(
514+
coo_container,
402515
n_samples,
403516
n_features,
404-
n_samples * n_features // 100 + 1,
517+
n_nonzeros=n_samples * n_features // 100 + 1,
405518
random_state=global_random_seed,
519+
sparse_format="csr",
406520
)
407521

408522
for X in [X_dense, X_csr]:

0 commit comments

Comments
 (0)