-
-
Notifications
You must be signed in to change notification settings - Fork 26.3k
Closed
Closed
Copy link
Description
Describe the bug
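`LogisticRegression` with the `sag` and `saga` solvers fails for very large CSR matrices. It seems that int64 indices are not supported: the matrix below has more than 2**31 - 1 stored elements, so SciPy stores its `indices` and `indptr` arrays as int64, while the solver's dataset wrapper expects 32-bit `int` buffers.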
`X_train_one_hot` from the code snippet below has the following properties:
<8469764x32308 sparse matrix of type '<class 'numpy.float64'>'
with 4776946896 stored elements in Compressed Sparse Row format>
Steps/Code to Reproduce
from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression(solver='saga', C=(10 ** (-3)), fit_intercept=False, n_jobs=40)
regressor.fit(X_train_one_hot, y_train.values)
Expected Results
No error
Actual Results
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<timed exec> in <module>
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py in fit(self, X, y, sample_weight)
1404 else:
1405 prefer = 'processes'
-> 1406 fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
1407 **_joblib_parallel_args(prefer=prefer))(
1408 path_func(X, y, pos_class=class_, Cs=[C_],
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/pool.py in get(self, timeout)
769 return self._value
770 else:
--> 771 raise self._value
772
773 def _set(self, i, obj):
~/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
123 job, i, func, args, kwds = task
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
127 if wrap_exception and func is not _helper_reraises_exception:
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/joblib/_parallel_backends.py in __call__(self, *args, **kwargs)
593 def __call__(self, *args, **kwargs):
594 try:
--> 595 return self.func(*args, **kwargs)
596 except KeyboardInterrupt as e:
597 # We capture the KeyboardInterrupt and reraise it as
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py in _logistic_regression_path(X, y, pos_class, Cs, fit_intercept, max_iter, tol, verbose, solver, coef, class_weight, dual, penalty, intercept_scaling, multi_class, random_state, check_input, max_squared_sum, sample_weight, l1_ratio)
796 beta = (1. / C) * l1_ratio
797
--> 798 w0, n_iter_i, warm_start_sag = sag_solver(
799 X, target, sample_weight, loss, alpha,
800 beta, max_iter, tol,
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/sklearn/linear_model/_sag.py in sag_solver(X, y, sample_weight, loss, alpha, beta, max_iter, tol, verbose, random_state, check_input, max_squared_sum, warm_start_mem, is_saga)
296 num_seen_init = 0
297
--> 298 dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state)
299
300 if max_squared_sum is None:
~/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/lib/python3.9/site-packages/sklearn/linear_model/_base.py in make_dataset(X, y, sample_weight, random_state)
91
92 if sp.issparse(X):
---> 93 dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight,
94 seed=seed)
95 intercept_decay = SPARSE_INTERCEPT_DECAY
sklearn/utils/_seq_dataset.pyx in sklearn.utils._seq_dataset.CSRDataset64.__cinit__()
ValueError: Buffer dtype mismatch, expected 'int' but got 'long'
Versions
System:
python: 3.9.5 (default, Jun 30 2021, 14:14:03) [GCC 4.8.5 20150623 (Red Hat 4.8.5-44)]
executable: /home/as-sonin/.cache/pypoetry/virtualenvs/market-making-O6Kn0-Jh-py3.9/bin/python
machine: Linux-3.10.0-1127.18.2.el7.x86_64-x86_64-with-glibc2.17
Python dependencies:
pip: 21.2.4
setuptools: 54.1.2
sklearn: 0.24.2
numpy: 1.20.3
scipy: 1.6.1
Cython: None
pandas: 1.3.2
matplotlib: 3.4.3
joblib: 1.0.1
threadpoolctl: 2.2.0
Built with OpenMP: True
Metadata
Metadata
Assignees
Labels
No labels