Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 60cc5b5

Browse files
jjerphan, ogrisel, lorentzenchr, jeremiedbb, thomasjpfan
authored
FEA Fused sparse-dense support for PairwiseDistancesReduction (#23585)
Co-authored-by: Olivier Grisel <[email protected]> Co-authored-by: Christian Lorentzen <[email protected]> Co-authored-by: Jérémie du Boisberranger <[email protected]> Co-authored-by: Thomas J. Fan <[email protected]> Co-authored-by: Meekail Zain <[email protected]>
1 parent bfe68b4 commit 60cc5b5

File tree

16 files changed

+525
-89
lines changed

16 files changed

+525
-89
lines changed

doc/whats_new/v1.2.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,29 @@ Changes impacting all modules
5151
second-pass algorithm.
5252
:pr:`23197` by :user:`Meekail Zain <micky774>`
5353

54+
- |Enhancement| Support for combinations of dense and sparse datasets pairs
55+
for all distance metrics and for float32 and float64 datasets has been added
56+
or has seen its performance improved for the following estimators:
57+
58+
- :func:`sklearn.metrics.pairwise_distances_argmin`
59+
- :func:`sklearn.metrics.pairwise_distances_argmin_min`
60+
- :class:`sklearn.cluster.AffinityPropagation`
61+
- :class:`sklearn.cluster.Birch`
62+
- :class:`sklearn.cluster.SpectralClustering`
63+
- :class:`sklearn.neighbors.KNeighborsClassifier`
64+
- :class:`sklearn.neighbors.KNeighborsRegressor`
65+
- :class:`sklearn.neighbors.RadiusNeighborsClassifier`
66+
- :class:`sklearn.neighbors.RadiusNeighborsRegressor`
67+
- :class:`sklearn.neighbors.LocalOutlierFactor`
68+
- :class:`sklearn.neighbors.NearestNeighbors`
69+
- :class:`sklearn.manifold.Isomap`
70+
- :class:`sklearn.manifold.TSNE`
71+
- :func:`sklearn.manifold.trustworthiness`
72+
73+
:pr:`23604` and :pr:`23585` by :user:`Julien Jerphanion <jjerphan>`,
74+
:user:`Olivier Grisel <ogrisel>`, and `Thomas Fan`_.
75+
76+
5477
Changelog
5578
---------
5679

sklearn/manifold/_isomap.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -330,9 +330,9 @@ def fit(self, X, y=None):
330330
331331
Parameters
332332
----------
333-
X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}
333+
X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
334334
Sample data, shape = (n_samples, n_features), in the form of a
335-
numpy array, sparse graph, precomputed tree, or NearestNeighbors
335+
numpy array, sparse matrix, precomputed tree, or NearestNeighbors
336336
object.
337337
338338
y : Ignored
@@ -352,7 +352,7 @@ def fit_transform(self, X, y=None):
352352
353353
Parameters
354354
----------
355-
X : {array-like, sparse graph, BallTree, KDTree}
355+
X : {array-like, sparse matrix, BallTree, KDTree}
356356
Training vector, where `n_samples` is the number of samples
357357
and `n_features` is the number of features.
358358
@@ -381,7 +381,7 @@ def transform(self, X):
381381
382382
Parameters
383383
----------
384-
X : array-like, shape (n_queries, n_features)
384+
X : {array-like, sparse matrix}, shape (n_queries, n_features)
385385
If neighbors_algorithm='precomputed', X is assumed to be a
386386
distance matrix or a sparse graph of shape
387387
(n_queries, n_samples_fit).

sklearn/manifold/_t_sne.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -461,11 +461,12 @@ def trustworthiness(X, X_embedded, *, n_neighbors=5, metric="euclidean"):
461461
462462
Parameters
463463
----------
464-
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
464+
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
465+
(n_samples, n_samples)
465466
If the metric is 'precomputed' X must be a square distance
466467
matrix. Otherwise it contains a sample per row.
467468
468-
X_embedded : ndarray of shape (n_samples, n_components)
469+
X_embedded : {array-like, sparse matrix} of shape (n_samples, n_components)
469470
Embedding of the training data in low-dimensional space.
470471
471472
n_neighbors : int, default=5
@@ -1095,7 +1096,8 @@ def fit_transform(self, X, y=None):
10951096
10961097
Parameters
10971098
----------
1098-
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
1099+
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
1100+
(n_samples, n_samples)
10991101
If the metric is 'precomputed' X must be a square distance
11001102
matrix. Otherwise it contains a sample per row. If the method
11011103
is 'exact', X may be a sparse matrix of type 'csr', 'csc'
@@ -1121,7 +1123,8 @@ def fit(self, X, y=None):
11211123
11221124
Parameters
11231125
----------
1124-
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
1126+
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
1127+
(n_samples, n_samples)
11251128
If the metric is 'precomputed' X must be a square distance
11261129
matrix. Otherwise it contains a sample per row. If the method
11271130
is 'exact', X may be a sparse matrix of type 'csr', 'csc'

sklearn/manifold/tests/test_isomap.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -216,19 +216,21 @@ def test_isomap_clone_bug():
216216
assert model.nbrs_.n_neighbors == n_neighbors
217217

218218

219-
def test_sparse_input():
219+
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
220+
@pytest.mark.parametrize("path_method", path_methods)
221+
def test_sparse_input(eigen_solver, path_method):
222+
# TODO: compare results on dense and sparse data as proposed in:
223+
# https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
220224
X = sparse_rand(100, 3, density=0.1, format="csr")
221225

222-
# Should not error
223-
for eigen_solver in eigen_solvers:
224-
for path_method in path_methods:
225-
clf = manifold.Isomap(
226-
n_components=2,
227-
eigen_solver=eigen_solver,
228-
path_method=path_method,
229-
n_neighbors=8,
230-
)
231-
clf.fit(X)
226+
clf = manifold.Isomap(
227+
n_components=2,
228+
eigen_solver=eigen_solver,
229+
path_method=path_method,
230+
n_neighbors=8,
231+
)
232+
clf.fit(X)
233+
clf.transform(X)
232234

233235

234236
def test_isomap_fit_precomputed_radius_graph():

sklearn/manifold/tests/test_t_sne.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,9 @@ def test_optimization_minimizes_kl_divergence():
329329

330330

331331
@pytest.mark.parametrize("method", ["exact", "barnes_hut"])
332-
def test_fit_csr_matrix(method):
332+
def test_fit_transform_csr_matrix(method):
333+
# TODO: compare results on dense and sparse data as proposed in:
334+
# https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
333335
# X can be a sparse matrix.
334336
rng = check_random_state(0)
335337
X = rng.randn(50, 2)

sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@ implementation_specific_values = [
55
#
66
# name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE
77
#
8-
# We also use the float64 dtype and C-type names as defined in
9-
# `sklearn.utils._typedefs` to maintain consistency.
8+
# We use DistanceMetric for float64 for backward naming compatibility.
109
#
1110
('64', 'DistanceMetric', 'DTYPE_t', 'DTYPE'),
1211
('32', 'DistanceMetric32', 'cnp.float32_t', 'np.float32')
@@ -15,14 +14,16 @@ implementation_specific_values = [
1514
}}
1615
cimport numpy as cnp
1716

18-
from ...utils._typedefs cimport DTYPE_t, ITYPE_t
17+
from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
1918
from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric32
2019

2120
{{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
2221

2322

2423
cdef class DatasetsPair{{name_suffix}}:
25-
cdef {{DistanceMetric}} distance_metric
24+
cdef:
25+
{{DistanceMetric}} distance_metric
26+
ITYPE_t n_features
2627

2728
cdef ITYPE_t n_samples_X(self) nogil
2829

@@ -37,5 +38,35 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
3738
cdef:
3839
const {{INPUT_DTYPE_t}}[:, ::1] X
3940
const {{INPUT_DTYPE_t}}[:, ::1] Y
40-
ITYPE_t d
41+
42+
43+
cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
44+
cdef:
45+
const {{INPUT_DTYPE_t}}[:] X_data
46+
const SPARSE_INDEX_TYPE_t[:] X_indices
47+
const SPARSE_INDEX_TYPE_t[:] X_indptr
48+
49+
const {{INPUT_DTYPE_t}}[:] Y_data
50+
const SPARSE_INDEX_TYPE_t[:] Y_indices
51+
const SPARSE_INDEX_TYPE_t[:] Y_indptr
52+
53+
54+
cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
55+
cdef:
56+
const {{INPUT_DTYPE_t}}[:] X_data
57+
const SPARSE_INDEX_TYPE_t[:] X_indices
58+
const SPARSE_INDEX_TYPE_t[:] X_indptr
59+
60+
const {{INPUT_DTYPE_t}}[:] Y_data
61+
const SPARSE_INDEX_TYPE_t[:] Y_indices
62+
ITYPE_t n_Y
63+
64+
65+
cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
66+
cdef:
67+
# As distance metrics are commutative, we can simply rely
68+
# on the implementation of SparseDenseDatasetsPair and
69+
# swap arguments.
70+
DatasetsPair{{name_suffix}} datasets_pair
71+
4172
{{endfor}}

0 commit comments

Comments
 (0)