6
6
import pytest
7
7
import scipy .sparse as sp
8
8
9
- from sklearn .exceptions import DataDimensionalityWarning
9
+ from sklearn .exceptions import DataDimensionalityWarning , NotFittedError
10
10
from sklearn .metrics import euclidean_distances
11
11
from sklearn .random_projection import (
12
12
GaussianRandomProjection ,
22
22
assert_array_almost_equal ,
23
23
assert_array_equal ,
24
24
)
25
+ from sklearn .utils .fixes import COO_CONTAINERS
25
26
26
27
all_sparse_random_matrix : List [Any ] = [_sparse_random_matrix ]
27
28
all_dense_random_matrix : List [Any ] = [_gaussian_random_matrix ]
32
33
all_RandomProjection = all_SparseRandomProjection + all_DenseRandomProjection
33
34
34
35
35
- # Make some random data with uniformly located non zero entries with
36
- # Gaussian distributed values
37
- def make_sparse_random_data (n_samples , n_features , n_nonzeros , random_state = 0 ):
36
+ def make_sparse_random_data (
37
+ coo_container ,
38
+ n_samples ,
39
+ n_features ,
40
+ n_nonzeros ,
41
+ random_state = None ,
42
+ sparse_format = "csr" ,
43
+ ):
44
+ """Make some random data with uniformly located non-zero entries with
45
+ Gaussian distributed values; `sparse_format` is a scipy sparse format name
46
+ such as `"csr"` (default), or `None` (in which case a dense array is returned).
47
+ """
38
48
rng = np .random .RandomState (random_state )
39
- data_coo = sp . coo_matrix (
49
+ data_coo = coo_container (
40
50
(
41
51
rng .randn (n_nonzeros ),
42
52
(
@@ -46,7 +56,10 @@ def make_sparse_random_data(n_samples, n_features, n_nonzeros, random_state=0):
46
56
),
47
57
shape = (n_samples , n_features ),
48
58
)
49
- return data_coo .toarray (), data_coo .tocsr ()
59
+ if sparse_format is not None :
60
+ return data_coo .asformat (sparse_format )
61
+ else :
62
+ return data_coo .toarray ()
50
63
51
64
52
65
def densify (matrix ):
@@ -58,7 +71,6 @@ def densify(matrix):
58
71
59
72
n_samples , n_features = (10 , 1000 )
60
73
n_nonzeros = int (n_samples * n_features / 100.0 )
61
- data , data_csr = make_sparse_random_data (n_samples , n_features , n_nonzeros )
62
74
63
75
64
76
###############################################################################
@@ -221,14 +233,31 @@ def test_random_projection_transformer_invalid_input():
221
233
RandomProjection (n_components = n_components ).fit (fit_data )
222
234
223
235
224
- def test_try_to_transform_before_fit ():
236
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
237
+ def test_try_to_transform_before_fit (coo_container , global_random_seed ):
238
+ data = make_sparse_random_data (
239
+ coo_container ,
240
+ n_samples ,
241
+ n_features ,
242
+ n_nonzeros ,
243
+ random_state = global_random_seed ,
244
+ sparse_format = None ,
245
+ )
225
246
for RandomProjection in all_RandomProjection :
226
- with pytest .raises (ValueError ):
247
+ with pytest .raises (NotFittedError ):
227
248
RandomProjection (n_components = "auto" ).transform (data )
228
249
229
250
230
- def test_too_many_samples_to_find_a_safe_embedding ():
231
- data , _ = make_sparse_random_data (1000 , 100 , 1000 )
251
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
252
+ def test_too_many_samples_to_find_a_safe_embedding (coo_container , global_random_seed ):
253
+ data = make_sparse_random_data (
254
+ coo_container ,
255
+ n_samples = 1000 ,
256
+ n_features = 100 ,
257
+ n_nonzeros = 1000 ,
258
+ random_state = global_random_seed ,
259
+ sparse_format = None ,
260
+ )
232
261
233
262
for RandomProjection in all_RandomProjection :
234
263
rp = RandomProjection (n_components = "auto" , eps = 0.1 )
@@ -241,8 +270,16 @@ def test_too_many_samples_to_find_a_safe_embedding():
241
270
rp .fit (data )
242
271
243
272
244
- def test_random_projection_embedding_quality ():
245
- data , _ = make_sparse_random_data (8 , 5000 , 15000 )
273
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
274
+ def test_random_projection_embedding_quality (coo_container ):
275
+ data = make_sparse_random_data (
276
+ coo_container ,
277
+ n_samples = 8 ,
278
+ n_features = 5000 ,
279
+ n_nonzeros = 15000 ,
280
+ random_state = 0 ,
281
+ sparse_format = None ,
282
+ )
246
283
eps = 0.2
247
284
248
285
original_distances = euclidean_distances (data , squared = True )
@@ -271,28 +308,54 @@ def test_random_projection_embedding_quality():
271
308
assert 1 - eps < distances_ratio .min ()
272
309
273
310
274
- def test_SparseRandomProj_output_representation ():
311
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
312
+ def test_SparseRandomProj_output_representation (coo_container ):
313
+ dense_data = make_sparse_random_data (
314
+ coo_container ,
315
+ n_samples ,
316
+ n_features ,
317
+ n_nonzeros ,
318
+ random_state = 0 ,
319
+ sparse_format = None ,
320
+ )
321
+ sparse_data = make_sparse_random_data (
322
+ coo_container ,
323
+ n_samples ,
324
+ n_features ,
325
+ n_nonzeros ,
326
+ random_state = 0 ,
327
+ sparse_format = "csr" ,
328
+ )
275
329
for SparseRandomProj in all_SparseRandomProjection :
276
330
# when using sparse input, the projected data can be forced to be a
277
331
# dense numpy array
278
332
rp = SparseRandomProj (n_components = 10 , dense_output = True , random_state = 0 )
279
- rp .fit (data )
280
- assert isinstance (rp .transform (data ), np .ndarray )
281
-
282
- sparse_data = sp .csr_matrix (data )
333
+ rp .fit (dense_data )
334
+ assert isinstance (rp .transform (dense_data ), np .ndarray )
283
335
assert isinstance (rp .transform (sparse_data ), np .ndarray )
284
336
285
337
# the output can be left to a sparse matrix instead
286
338
rp = SparseRandomProj (n_components = 10 , dense_output = False , random_state = 0 )
287
- rp = rp .fit (data )
339
+ rp = rp .fit (dense_data )
288
340
# output for dense input will stay dense:
289
- assert isinstance (rp .transform (data ), np .ndarray )
341
+ assert isinstance (rp .transform (dense_data ), np .ndarray )
290
342
291
343
# output for sparse input will be sparse:
292
344
assert sp .issparse (rp .transform (sparse_data ))
293
345
294
346
295
- def test_correct_RandomProjection_dimensions_embedding ():
347
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
348
+ def test_correct_RandomProjection_dimensions_embedding (
349
+ coo_container , global_random_seed
350
+ ):
351
+ data = make_sparse_random_data (
352
+ coo_container ,
353
+ n_samples ,
354
+ n_features ,
355
+ n_nonzeros ,
356
+ random_state = global_random_seed ,
357
+ sparse_format = None ,
358
+ )
296
359
for RandomProjection in all_RandomProjection :
297
360
rp = RandomProjection (n_components = "auto" , random_state = 0 , eps = 0.5 ).fit (data )
298
361
@@ -334,24 +397,52 @@ def test_correct_RandomProjection_dimensions_embedding():
334
397
assert 85 < rp .components_ .nnz # close to 1% density
335
398
336
399
337
- def test_warning_n_components_greater_than_n_features ():
400
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
401
+ def test_warning_n_components_greater_than_n_features (
402
+ coo_container , global_random_seed
403
+ ):
338
404
n_features = 20
339
- data , _ = make_sparse_random_data (5 , n_features , int (n_features / 4 ))
405
+ n_samples = 5
406
+ n_nonzeros = int (n_features / 4 )
407
+ data = make_sparse_random_data (
408
+ coo_container ,
409
+ n_samples ,
410
+ n_features ,
411
+ n_nonzeros ,
412
+ random_state = global_random_seed ,
413
+ sparse_format = None ,
414
+ )
340
415
341
416
for RandomProjection in all_RandomProjection :
342
417
with pytest .warns (DataDimensionalityWarning ):
343
418
RandomProjection (n_components = n_features + 1 ).fit (data )
344
419
345
420
346
- def test_works_with_sparse_data ():
421
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
422
+ def test_works_with_sparse_data (coo_container , global_random_seed ):
347
423
n_features = 20
348
- data , _ = make_sparse_random_data (5 , n_features , int (n_features / 4 ))
424
+ n_samples = 5
425
+ n_nonzeros = int (n_features / 4 )
426
+ dense_data = make_sparse_random_data (
427
+ coo_container ,
428
+ n_samples ,
429
+ n_features ,
430
+ n_nonzeros ,
431
+ random_state = global_random_seed ,
432
+ sparse_format = None ,
433
+ )
434
+ sparse_data = make_sparse_random_data (
435
+ coo_container ,
436
+ n_samples ,
437
+ n_features ,
438
+ n_nonzeros ,
439
+ random_state = global_random_seed ,
440
+ sparse_format = "csr" ,
441
+ )
349
442
350
443
for RandomProjection in all_RandomProjection :
351
- rp_dense = RandomProjection (n_components = 3 , random_state = 1 ).fit (data )
352
- rp_sparse = RandomProjection (n_components = 3 , random_state = 1 ).fit (
353
- sp .csr_matrix (data )
354
- )
444
+ rp_dense = RandomProjection (n_components = 3 , random_state = 1 ).fit (dense_data )
445
+ rp_sparse = RandomProjection (n_components = 3 , random_state = 1 ).fit (sparse_data )
355
446
assert_array_almost_equal (
356
447
densify (rp_dense .components_ ), densify (rp_sparse .components_ )
357
448
)
@@ -365,8 +456,19 @@ def test_johnson_lindenstrauss_min_dim():
365
456
assert johnson_lindenstrauss_min_dim (100 , eps = 1e-5 ) == 368416070986
366
457
367
458
459
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
368
460
@pytest .mark .parametrize ("random_projection_cls" , all_RandomProjection )
369
- def test_random_projection_feature_names_out (random_projection_cls ):
461
+ def test_random_projection_feature_names_out (
462
+ coo_container , random_projection_cls , global_random_seed
463
+ ):
464
+ data = make_sparse_random_data (
465
+ coo_container ,
466
+ n_samples ,
467
+ n_features ,
468
+ n_nonzeros ,
469
+ random_state = global_random_seed ,
470
+ sparse_format = None ,
471
+ )
370
472
random_projection = random_projection_cls (n_components = 2 )
371
473
random_projection .fit (data )
372
474
names_out = random_projection .get_feature_names_out ()
@@ -379,11 +481,13 @@ def test_random_projection_feature_names_out(random_projection_cls):
379
481
assert_array_equal (names_out , expected_names_out )
380
482
381
483
484
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
382
485
@pytest .mark .parametrize ("n_samples" , (2 , 9 , 10 , 11 , 1000 ))
383
486
@pytest .mark .parametrize ("n_features" , (2 , 9 , 10 , 11 , 1000 ))
384
487
@pytest .mark .parametrize ("random_projection_cls" , all_RandomProjection )
385
488
@pytest .mark .parametrize ("compute_inverse_components" , [True , False ])
386
489
def test_inverse_transform (
490
+ coo_container ,
387
491
n_samples ,
388
492
n_features ,
389
493
random_projection_cls ,
@@ -398,11 +502,21 @@ def test_inverse_transform(
398
502
random_state = global_random_seed ,
399
503
)
400
504
401
- X_dense , X_csr = make_sparse_random_data (
505
+ X_dense = make_sparse_random_data (
506
+ coo_container ,
507
+ n_samples ,
508
+ n_features ,
509
+ n_nonzeros = n_samples * n_features // 100 + 1 ,
510
+ random_state = global_random_seed ,
511
+ sparse_format = None ,
512
+ )
513
+ X_csr = make_sparse_random_data (
514
+ coo_container ,
402
515
n_samples ,
403
516
n_features ,
404
- n_samples * n_features // 100 + 1 ,
517
+ n_nonzeros = n_samples * n_features // 100 + 1 ,
405
518
random_state = global_random_seed ,
519
+ sparse_format = "csr" ,
406
520
)
407
521
408
522
for X in [X_dense , X_csr ]:
0 commit comments