[MRG+1] Add 'axis' argument to sparsefuncs.mean_variance_axis #3622


Status: Closed · wants to merge 3 commits
4 changes: 2 additions & 2 deletions doc/developers/utilities.rst
@@ -140,8 +140,8 @@ Efficient Routines for Sparse Matrices
 The ``sklearn.utils.sparsefuncs`` cython module hosts compiled extensions to
 efficiently process ``scipy.sparse`` data.

-- :func:`sparsefuncs.mean_variance_axis0`: compute the means and
-  variances along axis 0 of a CSR matrix.
+- :func:`sparsefuncs.mean_variance_axis`: compute the means and
+  variances along a specified axis of a CSR matrix.
   Used for normalizing the tolerance stopping criterion in
   :class:`sklearn.cluster.k_means_.KMeans`.
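For orientation, a minimal sketch of the renamed helper's call signature (not part of the diff; it assumes a scikit-learn checkout containing this PR):

    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    X = sp.csr_matrix(np.array([[0., 1.], [2., 0.], [3., 4.]]))

    # Feature-wise statistics, equivalent to the old mean_variance_axis0(X):
    means, variances = mean_variance_axis(X, axis=0)

    # Sample-wise statistics, newly supported through the axis argument:
    row_means, row_vars = mean_variance_axis(X, axis=1)

    assert np.allclose(means, X.toarray().mean(axis=0))
    assert np.allclose(row_vars, X.toarray().var(axis=1))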
4 changes: 2 additions & 2 deletions sklearn/cluster/k_means_.py
@@ -20,7 +20,7 @@
 from ..metrics.pairwise import euclidean_distances
 from ..utils.extmath import row_norms, squared_norm
 from ..utils.sparsefuncs_fast import assign_rows_csr
-from ..utils.sparsefuncs import mean_variance_axis0
+from ..utils.sparsefuncs import mean_variance_axis
 from ..utils.fixes import astype
 from ..utils import check_array
 from ..utils import check_random_state
@@ -141,7 +141,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
 def _tolerance(X, tol):
     """Return a tolerance which is independent of the dataset"""
     if sp.issparse(X):
-        variances = mean_variance_axis0(X)[1]
+        variances = mean_variance_axis(X, axis=0)[1]
     else:
         variances = np.var(X, axis=0)
     return np.mean(variances) * tol
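As an aside, the equivalence this call preserves is easy to check; a small sketch (the helper name tolerance_sketch is invented for illustration):

    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    def tolerance_sketch(X, tol):
        # Mirrors _tolerance above: scale tol by the mean feature variance.
        if sp.issparse(X):
            variances = mean_variance_axis(X, axis=0)[1]
        else:
            variances = np.var(X, axis=0)
        return np.mean(variances) * tol

    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    X[X < 0.5] = 0  # sparsify a bit
    assert np.isclose(tolerance_sketch(X, 1e-4),
                      tolerance_sketch(sp.csr_matrix(X), 1e-4))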
4 changes: 2 additions & 2 deletions sklearn/decomposition/truncated_svd.py
@@ -17,7 +17,7 @@
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array, as_float_array, check_random_state
 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip
-from ..utils.sparsefuncs import mean_variance_axis0
+from ..utils.sparsefuncs import mean_variance_axis

 __all__ = ["TruncatedSVD"]

@@ -175,7 +175,7 @@ def fit_transform(self, X, y=None):
         X_transformed = np.dot(U, np.diag(Sigma))
         self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)
         if sp.issparse(X):
-            _, full_var = mean_variance_axis0(X)
+            _, full_var = mean_variance_axis(X, axis=0)
             full_var = full_var.sum()
         else:
             full_var = np.var(X, axis=0).sum()
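The sparse branch relies on the identity that the total variance is the sum of the per-feature variances; a quick sanity sketch (illustrative only):

    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    rng = np.random.RandomState(42)
    X = sp.csr_matrix(rng.rand(10, 4))

    # Total variance of sparse X, computed without densifying the matrix:
    _, full_var = mean_variance_axis(X, axis=0)
    assert np.isclose(full_var.sum(), np.var(X.toarray(), axis=0).sum())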
6 changes: 3 additions & 3 deletions sklearn/feature_selection/variance_threshold.py
@@ -5,7 +5,7 @@
 from ..base import BaseEstimator
 from .base import SelectorMixin
 from ..utils import check_array
-from ..utils.sparsefuncs_fast import csr_mean_variance_axis0
+from ..utils.sparsefuncs import mean_variance_axis


 class VarianceThreshold(BaseEstimator, SelectorMixin):
@@ -58,10 +58,10 @@ def fit(self, X, y=None):
         -------
         self
         """
-        X = check_array(X, 'csr', dtype=np.float64)
+        X = check_array(X, ('csr', 'csc'), dtype=np.float64)

         if hasattr(X, "toarray"):   # sparse matrix
-            _, self.variances_ = csr_mean_variance_axis0(X)
+            _, self.variances_ = mean_variance_axis(X, axis=0)
         else:
             self.variances_ = np.var(X, axis=0)
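With check_array accepting both formats, CSC input no longer has to be converted to CSR first. A short usage sketch of the estimator after this change:

    import scipy.sparse as sp
    from sklearn.feature_selection import VarianceThreshold

    X = sp.csc_matrix([[0., 2., 0., 3.],
                       [0., 1., 4., 3.],
                       [0., 1., 1., 3.]])
    selector = VarianceThreshold()          # default threshold of 0.0
    X_reduced = selector.fit_transform(X)   # drops the two constant columns
    print(X_reduced.shape)                  # (3, 2)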
6 changes: 3 additions & 3 deletions sklearn/linear_model/base.py
@@ -27,7 +27,7 @@
 from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
 from ..utils import as_float_array, check_array
 from ..utils.extmath import safe_sparse_dot
-from ..utils.sparsefuncs import mean_variance_axis0, inplace_column_scale
+from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale


 ###
@@ -48,14 +48,14 @@ def sparse_center_data(X, y, fit_intercept, normalize=False):
     if fit_intercept:
         # we might require not to change the csr matrix sometimes
        # store a copy if normalize is True.
-        # Change dtype to float64 since mean_variance_axis0 accepts
+        # Change dtype to float64 since mean_variance_axis accepts
         # it that way.
         if sp.isspmatrix(X) and X.getformat() == 'csr':
             X = sp.csr_matrix(X, copy=normalize, dtype=np.float64)
         else:
             X = sp.csc_matrix(X, copy=normalize, dtype=np.float64)

-        X_mean, X_var = mean_variance_axis0(X)
+        X_mean, X_var = mean_variance_axis(X, axis=0)
         if normalize:
             # transform variance to std in-place
             # XXX: currently scaled to variance=n_samples to match center_data
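The hunk is truncated at the "transform variance to std in-place" comment; a minimal sketch of that idiom (illustrative values, not the PR's code):

    import numpy as np

    X_var = np.array([4.0, 0.0, 9.0])
    X_std = np.sqrt(X_var, X_var)    # in-place sqrt: X_std aliases X_var
    X_std[X_std == 0] = 1.0          # guard zero-variance features
    print(X_std)                     # [2. 1. 3.]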
11 changes: 5 additions & 6 deletions sklearn/preprocessing/data.py
@@ -16,10 +16,9 @@
 from ..utils import warn_if_not_float
 from ..utils.extmath import row_norms
 from ..utils.fixes import combinations_with_replacement as combinations_w_r
-from ..utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
-from ..utils.sparsefuncs_fast import inplace_csr_row_normalize_l2
-from ..utils.sparsefuncs import inplace_column_scale
-from ..utils.sparsefuncs import mean_variance_axis0
+from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
+                                      inplace_csr_row_normalize_l2)
+from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis)

 zip = six.moves.zip
 map = six.moves.map
@@ -124,7 +123,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
             copy = False
         if copy:
             X = X.copy()
-        _, var = mean_variance_axis0(X)
+        _, var = mean_variance_axis(X, axis=0)
         var[var == 0.0] = 1.0
         inplace_column_scale(X, 1 / np.sqrt(var))
     else:
@@ -319,7 +318,7 @@ def fit(self, X, y=None):
             self.mean_ = None

             if self.with_std:
-                var = mean_variance_axis0(X)[1]
+                var = mean_variance_axis(X, axis=0)[1]
                 self.std_ = np.sqrt(var)
                 self.std_[var == 0.0] = 1.0
             else:
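For context, the sparse path of scale() can only standardize without centering (centering would densify the matrix); a minimal usage sketch:

    import numpy as np
    import scipy.sparse as sp
    from sklearn.preprocessing import scale

    X = sp.csr_matrix(np.array([[1., 0., 2.],
                                [3., 0., 0.],
                                [5., 6., 0.]]))
    X_scaled = scale(X, with_mean=False)
    # Every non-constant column now has unit variance:
    print(X_scaled.toarray().var(axis=0))   # approximately [1. 1. 1.]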
10 changes: 5 additions & 5 deletions sklearn/preprocessing/tests/test_data.py
@@ -14,7 +14,7 @@
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_warns

-from sklearn.utils.sparsefuncs import mean_variance_axis0
+from sklearn.utils.sparsefuncs import mean_variance_axis
 from sklearn.preprocessing.data import _transform_selected
 from sklearn.preprocessing.data import Binarizer
 from sklearn.preprocessing.data import KernelCenterer
@@ -283,7 +283,7 @@ def test_scaler_without_centering():
         X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
     assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

-    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
+    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
     assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
     assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

@@ -349,8 +349,8 @@ def test_scaler_int():
         [0., 1.109, 1.856, 21., 1.559], 2)
     assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

-    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(
-        X_csr_scaled.astype(np.float))
+    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(
+        X_csr_scaled.astype(np.float), 0)
     assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
     assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

@@ -432,7 +432,7 @@ def test_scale_function_without_centering():
     # Check that X has not been copied
     assert_true(X_scaled is not X)

-    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
+    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
     assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
     assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
28 changes: 22 additions & 6 deletions sklearn/utils/sparsefuncs.py
@@ -6,8 +6,8 @@
 import numpy as np

 from .fixes import sparse_min_max
-from .sparsefuncs_fast import (csr_mean_variance_axis0,
-                               csc_mean_variance_axis0)
+from .sparsefuncs_fast import csr_mean_variance_axis0 as _csr_mean_var_axis0
+from .sparsefuncs_fast import csc_mean_variance_axis0 as _csc_mean_var_axis0


 def _raise_typeerror(X):
@@ -53,14 +53,17 @@ def inplace_csr_row_scale(X, scale):
     X.data *= np.repeat(scale, np.diff(X.indptr))


-def mean_variance_axis0(X):
-    """Compute mean and variance along axis 0 on a CSR or CSC matrix
+def mean_variance_axis(X, axis):
+    """Compute mean and variance along an axis on a CSR or CSC matrix

     Parameters
     ----------
     X: CSR or CSC sparse matrix, shape (n_samples, n_features)
         Input data.

+    axis: int (either 0 or 1)
+        Axis along which the mean and variance are computed.
Member: Apparently, you also accept -1 and -2.

Contributor Author: True, out of consistency with other methods in sklearn (and scipy in general) that handle the axis argument this way as well (e.g. count_nonzero in the same file), but those functions don't document that usage, either. I assumed this is an sklearn convention.

Contributor Author: E.g. see also https://github.com/scipy/scipy/blob/master/scipy/sparse/compressed.py, which uses the same convention throughout but never documents it.

Member: Whereas numpy.matrix.std has a docstring that says:

    Refer to `numpy.std` for full documentation.

Contributor Author: Grepping through the numpy and scipy codebases, it seems the most common way is to describe this as "axis : int" without specifying which values are allowed (which makes sense for numpy, given that an ndarray can have any number of axes), while the scipy.sparse module explicitly lists 0 and 1 as valid arguments (never -1 and -2, although the functions in question do accept those values as well). Personally I think the way I documented it makes sense, as it is consistent with scipy.sparse.

Member: count_nonzero is a backport from NumPy. We don't generally accept funny axes, since data is assumed to be 2-d almost everywhere.

Contributor Author: So what do you suggest would be the right thing to do? Remove -2/-1 as accepted values?

Member: Yes, I'd get rid of those. They're unlikely to be more useful than confusing.

Member: Perhaps more to the point, unlike scipy.sparse, the utils here are not public.

     Returns
     -------
@@ -71,10 +74,20 @@ def mean_variance_axis0(X):
         Feature-wise variances

     """
+    if axis not in (0, 1):
+        raise ValueError(
+            "Unknown axis value: %d. Use 0 for rows, or 1 for columns" % axis)
Member: This could be better if the message echoed the axis given by the user.

Contributor Author: Done.

     if isinstance(X, sp.csr_matrix):
-        return csr_mean_variance_axis0(X)
+        if axis == 0:
+            return _csr_mean_var_axis0(X)
+        else:
+            return _csc_mean_var_axis0(X.T)
     elif isinstance(X, sp.csc_matrix):
-        return csc_mean_variance_axis0(X)
+        if axis == 0:
+            return _csc_mean_var_axis0(X)
+        else:
+            return _csr_mean_var_axis0(X.T)
     else:
         _raise_typeerror(X)
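A note on why this dispatch is cheap: transposing a CSR matrix yields a CSC matrix (and vice versa) without copying data, so the axis=1 case reuses the existing axis-0 kernels. A quick sketch of the identity being exploited:

    import numpy as np
    import scipy.sparse as sp

    X = sp.csr_matrix(np.arange(12, dtype=np.float64).reshape(3, 4))
    assert isinstance(X.T, sp.csc_matrix)   # CSR.T is CSC, data shared

    # Row-wise (axis=1) statistics of X are the column-wise (axis=0)
    # statistics of X.T, which the csc kernel already computes:
    assert np.allclose(X.toarray().mean(axis=1), X.T.toarray().mean(axis=0))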

@@ -258,13 +271,16 @@ def inplace_swap_column(X, m, n):


 def min_max_axis(X, axis):
-    """Compute minimum and maximum along axis 0 on a CSR or CSC matrix
+    """Compute minimum and maximum along an axis on a CSR or CSC matrix

     Parameters
     ----------
     X : CSR or CSC sparse matrix, shape (n_samples, n_features)
         Input data.

+    axis: int (either 0 or 1)
+        Axis along which the minima and maxima are computed.
+
     Returns
     -------
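For completeness, the helper whose docstring gains the same axis wording; a minimal usage sketch:

    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import min_max_axis

    X = sp.csr_matrix(np.array([[0., 5.], [3., 0.]]))
    mins, maxs = min_max_axis(X, axis=0)    # per-column extrema
    assert np.array_equal(mins, [0., 0.]) and np.array_equal(maxs, [3., 5.])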
59 changes: 52 additions & 7 deletions sklearn/utils/tests/test_sparsefuncs.py
@@ -5,7 +5,7 @@
 from numpy.testing import assert_array_almost_equal, assert_array_equal

 from sklearn.datasets import make_classification
-from sklearn.utils.sparsefuncs import (mean_variance_axis0,
+from sklearn.utils.sparsefuncs import (mean_variance_axis,
                                        inplace_column_scale,
                                        inplace_row_scale,
                                        inplace_swap_row, inplace_swap_column,
@@ -26,27 +26,72 @@ def test_mean_variance_axis0():
     X[1, 0] = 0
     X_csr = sp.csr_matrix(X_lil)

-    X_means, X_vars = mean_variance_axis0(X_csr)
+    X_means, X_vars = mean_variance_axis(X_csr, axis=0)
     assert_array_almost_equal(X_means, np.mean(X, axis=0))
     assert_array_almost_equal(X_vars, np.var(X, axis=0))

     X_csc = sp.csc_matrix(X_lil)
-    X_means, X_vars = mean_variance_axis0(X_csc)
+    X_means, X_vars = mean_variance_axis(X_csc, axis=0)

     assert_array_almost_equal(X_means, np.mean(X, axis=0))
     assert_array_almost_equal(X_vars, np.var(X, axis=0))
-    assert_raises(TypeError, mean_variance_axis0, X_lil)
+    assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)

     X = X.astype(np.float32)
     X_csr = X_csr.astype(np.float32)
     X_csc = X_csc.astype(np.float32)
-    X_means, X_vars = mean_variance_axis0(X_csr)
+    X_means, X_vars = mean_variance_axis(X_csr, axis=0)
     assert_array_almost_equal(X_means, np.mean(X, axis=0))
     assert_array_almost_equal(X_vars, np.var(X, axis=0))
-    X_means, X_vars = mean_variance_axis0(X_csc)
+    X_means, X_vars = mean_variance_axis(X_csc, axis=0)
     assert_array_almost_equal(X_means, np.mean(X, axis=0))
     assert_array_almost_equal(X_vars, np.var(X, axis=0))
-    assert_raises(TypeError, mean_variance_axis0, X_lil)
+    assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)


+def test_mean_variance_illegal_axis():
+    X, _ = make_classification(5, 4, random_state=0)
+    # Sparsify the array a little bit
+    X[0, 0] = 0
+    X[2, 1] = 0
+    X[4, 3] = 0
+    X_csr = sp.csr_matrix(X)
+    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
+    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
+    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)
+
+
+def test_mean_variance_axis1():
+    X, _ = make_classification(5, 4, random_state=0)
+    # Sparsify the array a little bit
+    X[0, 0] = 0
+    X[2, 1] = 0
+    X[4, 3] = 0
+    X_lil = sp.lil_matrix(X)
+    X_lil[1, 0] = 0
+    X[1, 0] = 0
+    X_csr = sp.csr_matrix(X_lil)
+
+    X_means, X_vars = mean_variance_axis(X_csr, axis=1)
+    assert_array_almost_equal(X_means, np.mean(X, axis=1))
+    assert_array_almost_equal(X_vars, np.var(X, axis=1))
+
+    X_csc = sp.csc_matrix(X_lil)
+    X_means, X_vars = mean_variance_axis(X_csc, axis=1)
+
+    assert_array_almost_equal(X_means, np.mean(X, axis=1))
+    assert_array_almost_equal(X_vars, np.var(X, axis=1))
+    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
+
+    X = X.astype(np.float32)
+    X_csr = X_csr.astype(np.float32)
+    X_csc = X_csc.astype(np.float32)
+    X_means, X_vars = mean_variance_axis(X_csr, axis=1)
+    assert_array_almost_equal(X_means, np.mean(X, axis=1))
+    assert_array_almost_equal(X_vars, np.var(X, axis=1))
+    X_means, X_vars = mean_variance_axis(X_csc, axis=1)
+    assert_array_almost_equal(X_means, np.mean(X, axis=1))
+    assert_array_almost_equal(X_vars, np.var(X, axis=1))
+    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)


 def test_densify_rows():