Description
By design, IncrementalPCA is well suited to sparse data in a way that most PCA classes are not, since it only ever operates on one batch of samples at a time. However, it is not written to accept sparse input by default.
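For context, only the current batch ever needs to be densified, so peak memory stays on the order of batch_size × n_features rather than the full dense matrix. A rough sketch of the pattern this issue asks to support natively (the matrix shape, density and batch size below are arbitrary):

from scipy import sparse
from sklearn.decomposition import IncrementalPCA
from sklearn.utils import gen_batches

X = sparse.random(10000, 200, density=0.01, format='csr', random_state=0)
ipca = IncrementalPCA(n_components=10)

# Densify one batch at a time; the full matrix is never materialised densely.
for batch in gen_batches(X.shape[0], 1000):
    ipca.partial_fit(X[batch].toarray())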
Steps/Code to Reproduce
import numpy as np
from sklearn.decomposition import IncrementalPCA
from scipy import sparse

pca_op = IncrementalPCA(batch_size=10)
X = np.random.poisson(0.2, [100, 100])
for m in [sparse.csc_matrix, sparse.csr_matrix, sparse.dok_matrix, sparse.lil_matrix]:
    pca_op.fit_transform(m(X))
Expected Results
No error should be thrown.
Actual Results
Traceback (most recent call last):
  File "<stdin>", line 2, in <module>
  File "/home/scottgigante/.local/lib/python3.5/site-packages/sklearn/base.py", line 464, in fit_transform
    return self.fit(X, **fit_params).transform(X)
  File "/home/scottgigante/.local/lib/python3.5/site-packages/sklearn/decomposition/incremental_pca.py", line 191, in fit
    X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
  File "/home/scottgigante/.local/lib/python3.5/site-packages/sklearn/utils/validation.py", line 517, in check_array
    accept_large_sparse=accept_large_sparse)
  File "/home/scottgigante/.local/lib/python3.5/site-packages/sklearn/utils/validation.py", line 318, in _ensure_sparse_format
    raise TypeError('A sparse matrix was passed, but dense '
TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
Suggested fix
import numpy as np
from scipy import sparse
from sklearn.decomposition import IncrementalPCA
from sklearn.utils import check_array, gen_batches


class IncrementalPCA(IncrementalPCA):

    def fit(self, X, y=None):
        self.components_ = None
        self.n_samples_seen_ = 0
        self.mean_ = .0
        self.var_ = .0
        self.singular_values_ = None
        self.explained_variance_ = None
        self.explained_variance_ratio_ = None
        self.noise_variance_ = None

        # Accept sparse input here; each batch is densified in partial_fit.
        X = check_array(X, accept_sparse=['csr', 'csc', 'dok', 'lil'],
                        copy=self.copy, dtype=[np.float64, np.float32])
        n_samples, n_features = X.shape

        if self.batch_size is None:
            self.batch_size_ = 5 * n_features
        else:
            self.batch_size_ = self.batch_size

        for batch in gen_batches(n_samples, self.batch_size_,
                                 min_batch_size=self.n_components or 0):
            self.partial_fit(X[batch], check_input=True)
        return self

    def partial_fit(self, X, y=None, check_input=True):
        # Densify only the current batch before deferring to the parent class.
        if check_input and sparse.issparse(X):
            X = X.toarray()
        return super().partial_fit(X, y=y, check_input=check_input)

    def transform(self, X):
        # Transform in batches as well, so the full matrix is never densified.
        n_samples = X.shape[0]
        output = []
        for batch in gen_batches(n_samples, self.batch_size_,
                                 min_batch_size=self.n_components or 0):
            X_batch = X[batch]
            if sparse.issparse(X_batch):
                X_batch = X_batch.toarray()
            output.append(super().transform(X_batch))
        return np.vstack(output)


pca_op = IncrementalPCA(batch_size=10)
X = np.random.poisson(0.2, [100, 100])
for m in [sparse.csc_matrix, sparse.csr_matrix, sparse.dok_matrix, sparse.lil_matrix]:
    pca_op.fit_transform(m(X))
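As a sanity check (not part of the proposed patch), the subclass can be compared against the stock estimator on the densified matrix, reusing pca_op and X from the snippet above; the tolerance is arbitrary:

from sklearn.decomposition import IncrementalPCA as StockIncrementalPCA

# The fitting path is identical once each batch is densified, so the embeddings
# should agree up to floating-point noise (absolute values are compared to be
# safe against possible sign flips of individual components).
Y_sparse = pca_op.fit_transform(sparse.csr_matrix(X))
Y_dense = StockIncrementalPCA(batch_size=10).fit_transform(X)
assert np.allclose(np.abs(Y_sparse), np.abs(Y_dense), atol=1e-8)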
I'd be happy to submit this as a PR if it's desirable.
Versions
>>> import sklearn; sklearn.show_versions()
/home/scottgigante/.local/lib/python3.5/site-packages/numpy/distutils/system_info.py:638: UserWarning:
Atlas (http://math-atlas.sourceforge.net/) libraries not found.
Directories to search for the libraries can be specified in the
numpy/distutils/site.cfg file (section [atlas]) or by setting
the ATLAS environment variable.
self.calc_info()
/usr/bin/ld: cannot find -lcblas
collect2: error: ld returned 1 exit status
System:
executable: /usr/bin/python3
python: 3.5.2 (default, Nov 23 2017, 16:37:01) [GCC 5.4.0 20160609]
machine: Linux-4.4.0-17134-Microsoft-x86_64-with-Ubuntu-16.04-xenial
BLAS:
cblas_libs: cblas
lib_dirs: /usr/lib
macros: NO_ATLAS_INFO=1, HAVE_CBLAS=None
Python deps:
scipy: 1.2.1
pandas: 0.23.4
pip: 19.0.3
numpy: 1.16.2
Cython: None
sklearn: 0.20.3
setuptools: 40.8.0