From 9e710ed8437908c3eaaf6e68f5523842a846c6ea Mon Sep 17 00:00:00 2001 From: CJ Carey Date: Thu, 23 Apr 2015 12:58:17 -0400 Subject: [PATCH 1/4] WIP: adding 'max' normalizer to normalize() This still needs tests and doc updates. --- sklearn/preprocessing/data.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index c97d0ba3e5aed..88a0452f2dc4a 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -570,7 +570,7 @@ def normalize(X, norm='l2', axis=1, copy=True): scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. - norm : 'l1' or 'l2', optional ('l2' by default) + norm : 'l1', 'l2', or 'max', optional ('l2' by default) The norm to use to normalize each non zero sample (or each non-zero feature if axis is 0). @@ -589,7 +589,7 @@ def normalize(X, norm='l2', axis=1, copy=True): using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`) """ - if norm not in ('l1', 'l2'): + if norm not in ('l1', 'l2', 'max'): raise ValueError("'%s' is not a supported norm" % norm) if axis == 0: @@ -609,13 +609,19 @@ def normalize(X, norm='l2', axis=1, copy=True): inplace_csr_row_normalize_l1(X) elif norm == 'l2': inplace_csr_row_normalize_l2(X) + elif norm == 'max': + norms = X.max(axis=1).toarray() + norms = norms.repeat(np.diff(X.indptr)) + mask = norms != 0 + X.data[mask] /= norms[mask] else: if norm == 'l1': norms = np.abs(X).sum(axis=1) - norms[norms == 0.0] = 1.0 elif norm == 'l2': norms = row_norms(X) - norms[norms == 0.0] = 1.0 + elif norm == 'max': + norms = np.max(X, axis=1) + norms[norms == 0.0] = 1.0 X /= norms[:, np.newaxis] if axis == 0: From 007ae76cb358fe7836b8a37618062bdfa5629c16 Mon Sep 17 00:00:00 2001 From: CJ Carey Date: Fri, 8 May 2015 14:20:54 -0400 Subject: [PATCH 2/4] TST: covering norm='max' branches of normalize() --- sklearn/preprocessing/tests/test_data.py | 49 
++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 6e565010d2ada..fe536517837d3 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -604,6 +604,55 @@ def test_normalizer_l2():
         assert_almost_equal(la.norm(X_norm[3]), 0.0)
 
 
+def test_normalizer_max():
+    # Normalizer(norm='max') must scale every non-zero row to a maximum of 1
+    rng = np.random.RandomState(0)
+    X_dense = rng.randn(4, 5)
+    X_sparse_unpruned = sparse.csr_matrix(X_dense)
+
+    # set the row number 3 to zero
+    X_dense[3, :] = 0.0
+
+    # set the row number 3 to zero without pruning (can happen in real life)
+    indptr_3 = X_sparse_unpruned.indptr[3]
+    indptr_4 = X_sparse_unpruned.indptr[4]
+    X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
+
+    # build the pruned variant using the regular constructor
+    X_sparse_pruned = sparse.csr_matrix(X_dense)
+
+    # check inputs that support the no-copy optim
+    for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
+        normalizer = Normalizer(norm='max', copy=True)
+        X_norm1 = normalizer.transform(X)
+        assert_true(X_norm1 is not X)
+        X_norm1 = toarray(X_norm1)
+
+        normalizer = Normalizer(norm='max', copy=False)
+        X_norm2 = normalizer.transform(X)
+        assert_true(X_norm2 is X)
+        X_norm2 = toarray(X_norm2)
+
+        for X_norm in (X_norm1, X_norm2):
+            row_maxs = X_norm.max(axis=1)
+            for i in range(3):
+                assert_almost_equal(row_maxs[i], 1.0)
+            assert_almost_equal(row_maxs[3], 0.0)
+
+    # check input for which copy=False won't prevent a copy
+    for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
+        X = init(X_dense)
+        X_norm = Normalizer(norm='max', copy=False).transform(X)
+
+        assert_true(X_norm is not X)
+        assert_true(isinstance(X_norm, sparse.csr_matrix))
+
+        X_norm = toarray(X_norm)
+        for i in range(3):
+            assert_almost_equal(X_norm[i].max(), 1.0)
+        assert_almost_equal(la.norm(X_norm[3]), 0.0)
+
+
 def test_normalize():
     # Test normalize function
     # Only tests functionality not
used by the tests for Normalizer. From 84ee88db791c962d1cb4d3b99cf41b84a35856b6 Mon Sep 17 00:00:00 2001 From: CJ Carey Date: Fri, 8 May 2015 14:22:55 -0400 Subject: [PATCH 3/4] DOC: updating Normalizer docstring for norm='max' --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 88a0452f2dc4a..d619c26ca85d4 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -649,7 +649,7 @@ class Normalizer(BaseEstimator, TransformerMixin): Parameters ---------- - norm : 'l1' or 'l2', optional ('l2' by default) + norm : 'l1', 'l2', or 'max', optional ('l2' by default) The norm to use to normalize each non zero sample. copy : boolean, optional, default True From 5fcad7cc673b6dfbfbacfffe7bf8dbb2ac6e672f Mon Sep 17 00:00:00 2001 From: CJ Carey Date: Fri, 8 May 2015 15:13:27 -0400 Subject: [PATCH 4/4] Fixing sparse max for older scipy --- sklearn/preprocessing/data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index d619c26ca85d4..fa268280e1241 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -22,7 +22,8 @@ from ..utils.fixes import isclose from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2) -from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis) +from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis, + min_max_axis) from ..utils.validation import check_is_fitted zip = six.moves.zip @@ -610,7 +611,7 @@ def normalize(X, norm='l2', axis=1, copy=True): elif norm == 'l2': inplace_csr_row_normalize_l2(X) elif norm == 'max': - norms = X.max(axis=1).toarray() + _, norms = min_max_axis(X, 1) norms = norms.repeat(np.diff(X.indptr)) mask = norms != 0 X.data[mask] /= norms[mask]