diff --git a/doc/developers/utilities.rst b/doc/developers/utilities.rst index a661a539ba028..9f24e24fc65cf 100644 --- a/doc/developers/utilities.rst +++ b/doc/developers/utilities.rst @@ -140,8 +140,8 @@ Efficient Routines for Sparse Matrices The ``sklearn.utils.sparsefuncs`` cython module hosts compiled extensions to efficiently process ``scipy.sparse`` data. -- :func:`sparsefuncs.mean_variance_axis0`: compute the means and - variances along axis 0 of a CSR matrix. +- :func:`sparsefuncs.mean_variance_axis`: compute the means and + variances along a specified axis of a CSR matrix. Used for normalizing the tolerance stopping criterion in :class:`sklearn.cluster.k_means_.KMeans`. diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 413f4d12b0a07..f63549782dcb1 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -20,7 +20,7 @@ from ..metrics.pairwise import euclidean_distances from ..utils.extmath import row_norms, squared_norm from ..utils.sparsefuncs_fast import assign_rows_csr -from ..utils.sparsefuncs import mean_variance_axis0 +from ..utils.sparsefuncs import mean_variance_axis from ..utils.fixes import astype from ..utils import check_array from ..utils import check_random_state @@ -141,7 +141,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): def _tolerance(X, tol): """Return a tolerance which is independent of the dataset""" if sp.issparse(X): - variances = mean_variance_axis0(X)[1] + variances = mean_variance_axis(X, axis=0)[1] else: variances = np.var(X, axis=0) return np.mean(variances) * tol diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index 5e0d91dd04583..3b2033204e505 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -17,7 +17,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, as_float_array, check_random_state from ..utils.extmath 
import randomized_svd, safe_sparse_dot, svd_flip -from ..utils.sparsefuncs import mean_variance_axis0 +from ..utils.sparsefuncs import mean_variance_axis __all__ = ["TruncatedSVD"] @@ -175,7 +175,7 @@ def fit_transform(self, X, y=None): X_transformed = np.dot(U, np.diag(Sigma)) self.explained_variance_ = exp_var = np.var(X_transformed, axis=0) if sp.issparse(X): - _, full_var = mean_variance_axis0(X) + _, full_var = mean_variance_axis(X, axis=0) full_var = full_var.sum() else: full_var = np.var(X, axis=0).sum() diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index 6fe65224aeed4..a60b1ace256ac 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -5,7 +5,7 @@ from ..base import BaseEstimator from .base import SelectorMixin from ..utils import check_array -from ..utils.sparsefuncs_fast import csr_mean_variance_axis0 +from ..utils.sparsefuncs import mean_variance_axis class VarianceThreshold(BaseEstimator, SelectorMixin): @@ -58,10 +58,10 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, 'csr', dtype=np.float64) + X = check_array(X, ('csr', 'csc'), dtype=np.float64) if hasattr(X, "toarray"): # sparse matrix - _, self.variances_ = csr_mean_variance_axis0(X) + _, self.variances_ = mean_variance_axis(X, axis=0) else: self.variances_ = np.var(X, axis=0) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index fd82c9684e2f0..677f900a9daba 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -27,7 +27,7 @@ from ..base import BaseEstimator, ClassifierMixin, RegressorMixin from ..utils import as_float_array, check_array from ..utils.extmath import safe_sparse_dot -from ..utils.sparsefuncs import mean_variance_axis0, inplace_column_scale +from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale ### @@ -48,14 +48,14 @@ def sparse_center_data(X, y, fit_intercept, 
normalize=False): if fit_intercept: # we might require not to change the csr matrix sometimes # store a copy if normalize is True. - # Change dtype to float64 since mean_variance_axis0 accepts + # Change dtype to float64 since mean_variance_axis accepts # it that way. if sp.isspmatrix(X) and X.getformat() == 'csr': X = sp.csr_matrix(X, copy=normalize, dtype=np.float64) else: X = sp.csc_matrix(X, copy=normalize, dtype=np.float64) - X_mean, X_var = mean_variance_axis0(X) + X_mean, X_var = mean_variance_axis(X, axis=0) if normalize: # transform variance to std in-place # XXX: currently scaled to variance=n_samples to match center_data diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index e3c2ef34b5885..d1b8d89b64ec5 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -16,10 +16,9 @@ from ..utils import warn_if_not_float from ..utils.extmath import row_norms from ..utils.fixes import combinations_with_replacement as combinations_w_r -from ..utils.sparsefuncs_fast import inplace_csr_row_normalize_l1 -from ..utils.sparsefuncs_fast import inplace_csr_row_normalize_l2 -from ..utils.sparsefuncs import inplace_column_scale -from ..utils.sparsefuncs import mean_variance_axis0 +from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, + inplace_csr_row_normalize_l2) +from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis) zip = six.moves.zip map = six.moves.map @@ -124,7 +123,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True): copy = False if copy: X = X.copy() - _, var = mean_variance_axis0(X) + _, var = mean_variance_axis(X, axis=0) var[var == 0.0] = 1.0 inplace_column_scale(X, 1 / np.sqrt(var)) else: @@ -319,7 +318,7 @@ def fit(self, X, y=None): self.mean_ = None if self.with_std: - var = mean_variance_axis0(X)[1] + var = mean_variance_axis(X, axis=0)[1] self.std_ = np.sqrt(var) self.std_[var == 0.0] = 1.0 else: diff --git a/sklearn/preprocessing/tests/test_data.py 
b/sklearn/preprocessing/tests/test_data.py index 55e85a86dce8c..879b3b20aff85 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -14,7 +14,7 @@ from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_warns -from sklearn.utils.sparsefuncs import mean_variance_axis0 +from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.preprocessing.data import _transform_selected from sklearn.preprocessing.data import Binarizer from sklearn.preprocessing.data import KernelCenterer @@ -283,7 +283,7 @@ def test_scaler_without_centering(): X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) - X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled) + X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) @@ -349,8 +349,8 @@ def test_scaler_int(): [0., 1.109, 1.856, 21., 1.559], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) - X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0( - X_csr_scaled.astype(np.float)) + X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis( + X_csr_scaled.astype(np.float), 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) @@ -432,7 +432,7 @@ def test_scale_function_without_centering(): # Check that X has not been copied assert_true(X_scaled is not X) - X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled) + X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py 
index 8d71e9216e73a..e17cf95b6a535 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -6,8 +6,8 @@ import numpy as np from .fixes import sparse_min_max -from .sparsefuncs_fast import (csr_mean_variance_axis0, - csc_mean_variance_axis0) +from .sparsefuncs_fast import csr_mean_variance_axis0 as _csr_mean_var_axis0 +from .sparsefuncs_fast import csc_mean_variance_axis0 as _csc_mean_var_axis0 def _raise_typeerror(X): @@ -53,7 +53,7 @@ def inplace_csr_row_scale(X, scale): X.data *= np.repeat(scale, np.diff(X.indptr)) -def mean_variance_axis0(X): - """Compute mean and variance along axis 0 on a CSR or CSC matrix +def mean_variance_axis(X, axis): + """Compute mean and variance along an axis on a CSR or CSC matrix Parameters @@ -61,6 +61,9 @@ def mean_variance_axis0(X): X: CSR or CSC sparse matrix, shape (n_samples, n_features) Input data. + axis: int (either 0 or 1) + Axis along which the means and variances should be computed. + Returns ------- @@ -71,10 +74,20 @@ def mean_variance_axis0(X): Feature-wise variances """ + if axis not in (0, 1): + raise ValueError( + "Unknown axis value: %r. Use 0 for rows, or 1 for columns" % (axis,)) + if isinstance(X, sp.csr_matrix): - return csr_mean_variance_axis0(X) + if axis == 0: + return _csr_mean_var_axis0(X) + else: + return _csc_mean_var_axis0(X.T) elif isinstance(X, sp.csc_matrix): - return csc_mean_variance_axis0(X) + if axis == 0: + return _csc_mean_var_axis0(X) + else: + return _csr_mean_var_axis0(X.T) else: _raise_typeerror(X) @@ -258,13 +271,16 @@ def inplace_swap_column(X, m, n): def min_max_axis(X, axis): - """Compute minimum and maximum along axis 0 on a CSR or CSC matrix + """Compute minimum and maximum along an axis on a CSR or CSC matrix Parameters ---------- X : CSR or CSC sparse matrix, shape (n_samples, n_features) Input data. + axis: int (either 0 or 1) + Axis along which the minima and maxima should be computed.
+ Returns ------- diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 349a1cf412eb1..e7677994ce922 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -5,7 +5,7 @@ from numpy.testing import assert_array_almost_equal, assert_array_equal from sklearn.datasets import make_classification -from sklearn.utils.sparsefuncs import (mean_variance_axis0, +from sklearn.utils.sparsefuncs import (mean_variance_axis, inplace_column_scale, inplace_row_scale, inplace_swap_row, inplace_swap_column, @@ -26,27 +26,72 @@ def test_mean_variance_axis0(): X[1, 0] = 0 X_csr = sp.csr_matrix(X_lil) - X_means, X_vars = mean_variance_axis0(X_csr) + X_means, X_vars = mean_variance_axis(X_csr, axis=0) assert_array_almost_equal(X_means, np.mean(X, axis=0)) assert_array_almost_equal(X_vars, np.var(X, axis=0)) X_csc = sp.csc_matrix(X_lil) - X_means, X_vars = mean_variance_axis0(X_csc) + X_means, X_vars = mean_variance_axis(X_csc, axis=0) assert_array_almost_equal(X_means, np.mean(X, axis=0)) assert_array_almost_equal(X_vars, np.var(X, axis=0)) - assert_raises(TypeError, mean_variance_axis0, X_lil) + assert_raises(TypeError, mean_variance_axis, X_lil, axis=0) X = X.astype(np.float32) X_csr = X_csr.astype(np.float32) X_csc = X_csr.astype(np.float32) - X_means, X_vars = mean_variance_axis0(X_csr) + X_means, X_vars = mean_variance_axis(X_csr, axis=0) assert_array_almost_equal(X_means, np.mean(X, axis=0)) assert_array_almost_equal(X_vars, np.var(X, axis=0)) - X_means, X_vars = mean_variance_axis0(X_csc) + X_means, X_vars = mean_variance_axis(X_csc, axis=0) assert_array_almost_equal(X_means, np.mean(X, axis=0)) assert_array_almost_equal(X_vars, np.var(X, axis=0)) - assert_raises(TypeError, mean_variance_axis0, X_lil) + assert_raises(TypeError, mean_variance_axis, X_lil, axis=0) + + +def test_mean_variance_illegal_axis(): + X, _ = make_classification(5, 4, random_state=0) + # Sparsify the array a little bit 
+ X[0, 0] = 0 + X[2, 1] = 0 + X[4, 3] = 0 + X_csr = sp.csr_matrix(X) + assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3) + assert_raises(ValueError, mean_variance_axis, X_csr, axis=2) + assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1) + +def test_mean_variance_axis1(): + X, _ = make_classification(5, 4, random_state=0) + # Sparsify the array a little bit + X[0, 0] = 0 + X[2, 1] = 0 + X[4, 3] = 0 + X_lil = sp.lil_matrix(X) + X_lil[1, 0] = 0 + X[1, 0] = 0 + X_csr = sp.csr_matrix(X_lil) + + X_means, X_vars = mean_variance_axis(X_csr, axis=1) + assert_array_almost_equal(X_means, np.mean(X, axis=1)) + assert_array_almost_equal(X_vars, np.var(X, axis=1)) + + X_csc = sp.csc_matrix(X_lil) + X_means, X_vars = mean_variance_axis(X_csc, axis=1) + + assert_array_almost_equal(X_means, np.mean(X, axis=1)) + assert_array_almost_equal(X_vars, np.var(X, axis=1)) + assert_raises(TypeError, mean_variance_axis, X_lil, axis=1) + + X = X.astype(np.float32) + X_csr = X_csr.astype(np.float32) + X_csc = X_csc.astype(np.float32) + X_means, X_vars = mean_variance_axis(X_csr, axis=1) + assert_array_almost_equal(X_means, np.mean(X, axis=1)) + assert_array_almost_equal(X_vars, np.var(X, axis=1)) + X_means, X_vars = mean_variance_axis(X_csc, axis=1) + assert_array_almost_equal(X_means, np.mean(X, axis=1)) + assert_array_almost_equal(X_vars, np.var(X, axis=1)) + assert_raises(TypeError, mean_variance_axis, X_lil, axis=1) def test_densify_rows():