Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 27736ac

Browse files
jjerphan, lesteve, and thomasjpfan
authored and committed
FIX Better support large or read-only datasets in decomposition.DictionaryLearning (#25172)
Co-authored-by: Loïc Esteve <[email protected]> Co-authored-by: Thomas J. Fan <[email protected]>
1 parent f795186 commit 27736ac

File tree

5 files changed

+59
-18
lines changed

5 files changed

+59
-18
lines changed

doc/whats_new/v1.2.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ Changelog
3030
The ARFF specs requires to ignore the leading space.
3131
:pr:`25312` by :user:`Guillaume Lemaitre <glemaitre>`.
3232

33+
:mod:`sklearn.decomposition`
34+
............................
35+
36+
- |Fix| :class:`decomposition.DictionaryLearning` better supports readonly NumPy
37+
arrays. In particular, it better supports large datasets which are memory-mapped
38+
when it is used with coordinate descent algorithms (i.e. when `fit_algorithm='cd'`).
39+
:pr:`25172` by :user:`Julien Jerphanion <jjerphan>`.
40+
3341
:mod:`sklearn.ensemble`
3442
.......................
3543

sklearn/decomposition/_dict_learning.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,13 @@ def _sparse_encode(
174174
)
175175

176176
if init is not None:
177+
# In some workflows using coordinate descent algorithms:
178+
# - users might provide NumPy arrays with read-only buffers
179+
# - `joblib` might memmap arrays making their buffer read-only
180+
# TODO: move this handling (which is currently too broad)
181+
# closer to the actual private function which need buffers to be writable.
182+
if not init.flags["WRITEABLE"]:
183+
init = np.array(init)
177184
clf.coef_ = init
178185

179186
clf.fit(dictionary.T, X.T, check_input=check_input)

sklearn/decomposition/tests/test_dict_learning.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
from functools import partial
66
import itertools
77

8+
from joblib import Parallel
9+
10+
import sklearn
11+
812
from sklearn.base import clone
913

1014
from sklearn.exceptions import ConvergenceWarning
@@ -1038,6 +1042,29 @@ def test_get_feature_names_out(estimator):
10381042
)
10391043

10401044

1045+
def test_cd_work_on_joblib_memmapped_data(monkeypatch):
1046+
monkeypatch.setattr(
1047+
sklearn.decomposition._dict_learning,
1048+
"Parallel",
1049+
partial(Parallel, max_nbytes=100),
1050+
)
1051+
1052+
rng = np.random.RandomState(0)
1053+
X_train = rng.randn(10, 10)
1054+
1055+
dict_learner = DictionaryLearning(
1056+
n_components=5,
1057+
random_state=0,
1058+
n_jobs=2,
1059+
fit_algorithm="cd",
1060+
max_iter=50,
1061+
verbose=True,
1062+
)
1063+
1064+
# This must run and complete without error.
1065+
dict_learner.fit(X_train)
1066+
1067+
10411068
# TODO(1.4) remove
10421069
def test_minibatch_dictionary_learning_warns_and_ignore_n_iter():
10431070
"""Check that we always raise a warning when `n_iter` is set even if it is

sklearn/linear_model/_cd_fast.pyx

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -90,13 +90,13 @@ cdef floating diff_abs_max(int n, floating* a, floating* b) nogil:
9090
m = d
9191
return m
9292

93-
93+
# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used.
9494
def enet_coordinate_descent(
95-
floating[::1] w,
95+
cnp.ndarray[floating, ndim=1, mode='c'] w,
9696
floating alpha,
9797
floating beta,
98-
floating[::1, :] X,
99-
floating[::1] y,
98+
cnp.ndarray[floating, ndim=2, mode='fortran'] X,
99+
cnp.ndarray[floating, ndim=1, mode='c'] y,
100100
unsigned int max_iter,
101101
floating tol,
102102
object rng,
@@ -273,16 +273,17 @@ def enet_coordinate_descent(
273273
return np.asarray(w), gap, tol, n_iter + 1
274274

275275

276+
# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used.
276277
def sparse_enet_coordinate_descent(
277-
floating [::1] w,
278+
cnp.ndarray[floating, ndim=1, mode='c'] w,
278279
floating alpha,
279280
floating beta,
280-
floating[::1] X_data, # TODO: Make const after release of Cython 3 (#23147)
281+
cnp.ndarray[floating, ndim=1, mode='c'] X_data,
281282
const int[::1] X_indices,
282283
const int[::1] X_indptr,
283-
floating[::1] y,
284-
floating[::1] sample_weight,
285-
floating[::1] X_mean,
284+
cnp.ndarray[floating, ndim=1, mode='c'] y,
285+
cnp.ndarray[floating, ndim=1, mode='c'] sample_weight,
286+
cnp.ndarray[floating, ndim=1, mode='c'] X_mean,
286287
unsigned int max_iter,
287288
floating tol,
288289
object rng,
@@ -564,8 +565,9 @@ def sparse_enet_coordinate_descent(
564565
return np.asarray(w), gap, tol, n_iter + 1
565566

566567

568+
# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used.
567569
def enet_coordinate_descent_gram(
568-
floating[::1] w,
570+
cnp.ndarray[floating, ndim=1, mode='c'] w,
569571
floating alpha,
570572
floating beta,
571573
cnp.ndarray[floating, ndim=2, mode='c'] Q,
@@ -630,9 +632,9 @@ def enet_coordinate_descent_gram(
630632
cdef UINT32_t* rand_r_state = &rand_r_state_seed
631633

632634
cdef floating y_norm2 = np.dot(y, y)
633-
cdef floating* w_ptr = <floating*>&w[0]
635+
cdef floating* w_ptr = &w[0]
634636
cdef floating* Q_ptr = &Q[0, 0]
635-
cdef floating* q_ptr = <floating*>q.data
637+
cdef floating* q_ptr = &q[0]
636638
cdef floating* H_ptr = &H[0]
637639
cdef floating* XtA_ptr = &XtA[0]
638640
tol = tol * y_norm2
@@ -734,9 +736,9 @@ def enet_coordinate_descent_gram(
734736

735737
return np.asarray(w), gap, tol, n_iter + 1
736738

737-
739+
# TODO: use const fused typed memoryview where possible when Cython 0.29.33 is used.
738740
def enet_coordinate_descent_multi_task(
739-
floating[::1, :] W,
741+
cnp.ndarray[floating, ndim=2, mode='fortran'] W,
740742
floating l1_reg,
741743
floating l2_reg,
742744
# TODO: use const qualified fused-typed memoryview when Cython 3.0 is used.

sklearn/linear_model/_coordinate_descent.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
check_is_fitted,
3131
column_or_1d,
3232
)
33-
from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
3433
from ..utils.fixes import delayed
3534

3635
# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast'
@@ -594,9 +593,7 @@ def enet_path(
594593
w=coef_,
595594
alpha=l1_reg,
596595
beta=l2_reg,
597-
X_data=ReadonlyArrayWrapper(
598-
X.data
599-
), # TODO: Remove after release of Cython 3 (#23147)
596+
X_data=X.data,
600597
X_indices=X.indices,
601598
X_indptr=X.indptr,
602599
y=y,

0 commit comments

Comments (0)