Closed
Description
I ran into an issue when trying to construct polynomial expansion features from a large sparse matrix input:
[1] x = sp.sparse.rand(10000, 120006, density=0.000004)
[2] x
>>> <10000x120006 sparse matrix of type '<class 'numpy.float64'>'
with 4800 stored elements in COOrdinate format>
[2] from sklearn.preprocessing import PolynomialFeatures
[3] pf = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)
[4] xinter = pf.fit_transform(x)
And got the error `ValueError: negative column index found`:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-78-dc5dc18d59d2> in <module>
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-55-a32d56bebd65> in <module>
----> 1 xinter = pf.fit_transform(x)
~/anaconda2/envs/py37/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
688 if y is None:
689 # fit method of arity 1 (unsupervised transformation)
--> 690 return self.fit(X, **fit_params).transform(X)
691 else:
692 # fit method of arity 2 (supervised transformation)
~/anaconda2/envs/py37/lib/python3.7/site-packages/sklearn/preprocessing/_data.py in transform(self, X)
1571 break
1572 to_stack.append(Xp_next)
-> 1573 XP = sparse.hstack(to_stack, format='csr')
1574 elif sparse.isspmatrix_csc(X) and self.degree < 4:
1575 return self.transform(X.tocsr()).tocsc()
~/anaconda2/envs/py37/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
~/anaconda2/envs/py37/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
572 for j in range(N):
573 if blocks[i,j] is not None:
--> 574 A = coo_matrix(blocks[i,j])
575 blocks[i,j] = A
576 block_mask[i,j] = True
~/anaconda2/envs/py37/lib/python3.7/site-packages/scipy/sparse/coo.py in __init__(self, arg1, shape, dtype, copy)
170 self._shape = check_shape(arg1.shape)
171 else:
--> 172 coo = arg1.tocoo()
173 self.row = coo.row
174 self.col = coo.col
~/anaconda2/envs/py37/lib/python3.7/site-packages/scipy/sparse/compressed.py in tocoo(self, copy)
1016 from .coo import coo_matrix
1017 return coo_matrix((self.data, (row, col)), self.shape, copy=copy,
-> 1018 dtype=self.dtype)
1019
1020 tocoo.__doc__ = spmatrix.tocoo.__doc__
~/anaconda2/envs/py37/lib/python3.7/site-packages/scipy/sparse/coo.py in __init__(self, arg1, shape, dtype, copy)
196 self.data = self.data.astype(dtype, copy=False)
197
--> 198 self._check()
199
200 def reshape(self, *args, **kwargs):
~/anaconda2/envs/py37/lib/python3.7/site-packages/scipy/sparse/coo.py in _check(self)
289 raise ValueError('negative row index found')
290 if self.col.min() < 0:
--> 291 raise ValueError('negative column index found')
292
293 def transpose(self, axes=None, copy=False):
ValueError: negative column index found
The problem is not with `scipy`, as it correctly sets the index type to `int64`:
> /venv/lib/python3.6/site-packages/scipy/sparse/coo.py(291)_check()
289 raise ValueError('negative row index found')
290 if self.col.min() < 0:
--> 291 raise ValueError('negative column index found')
292
293 def transpose(self, axes=None, copy=False):
ipdb> self.col.max()
2147482788
ipdb> self.col.dtype
dtype('int64')
ipdb> self.col.min()
-2147480639
I believe the issue is in `sklearn/preprocessing/_data.py`, which calls `sklearn._csr_polynomial_expansion`, which in turn uses an `int32` for the column indices in the C code:
and:
I am wondering whether there is a quick fix for this. Thanks!