

Commit fa59873

yangarbiter authored and GaelVaroquaux committed
[MRG+1] FIX unstable cumsum (scikit-learn#7376)
* FIX unstable cumsum in utils.random
* equal_nan=True for isclose
* since numpy < 1.9 sum is as unstable as cumsum, fall back to np.cumsum
* added axis parameter to stable_cumsum
* FIX unstable cumsum in ensemble.weight_boosting and utils.stats
* FIX axis problem in stable_cumsum
* FIX unstable cumsum in mixture.gmm and mixture.dpgmm
* FIX unstable cumsum in cluster.k_means_, decomposition.pca, and manifold.locally_linear
* FIX unstable cumsum in datasets.samples_generator
* added docstring for parameter axis of stable_cumsum
* added comment for why we fall back to np.cumsum when numpy version < 1.9
* remove unneeded stable_cumsum
* added stable_cumsum axis testing
* FIX numpy docstring for make_sparse_spd_matrix
* change stable_cumsum from error to warning
1 parent 65a4de5 commit fa59873
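
For context on the fix itself: np.cumsum accumulates sequentially, so on long arrays its last element can drift away from np.sum, which uses pairwise summation. The helper patched below computes the cumsum in float64 and checks its final element against the sum. A minimal sketch of the failure mode being guarded against (the array size and dtype here are illustrative, not taken from the diff):

import numpy as np

# A long float32 array: sequential accumulation in cumsum loses precision,
# while np.sum's pairwise summation stays much closer to the true value.
r = np.random.RandomState(0).rand(100000).astype(np.float32)

last_of_cumsum = np.cumsum(r)[-1]            # naive running total (float32)
reference_sum = np.sum(r, dtype=np.float64)  # high-precision reference

# The two can visibly disagree, which is exactly what stable_cumsum detects.
print(last_of_cumsum, reference_sum)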

File tree

9 files changed (+40, -21 lines)


sklearn/cluster/k_means_.py
3 additions & 2 deletions

@@ -18,7 +18,7 @@
 
 from ..base import BaseEstimator, ClusterMixin, TransformerMixin
 from ..metrics.pairwise import euclidean_distances
-from ..utils.extmath import row_norms, squared_norm
+from ..utils.extmath import row_norms, squared_norm, stable_cumsum
 from ..utils.sparsefuncs_fast import assign_rows_csr
 from ..utils.sparsefuncs import mean_variance_axis
 from ..utils.fixes import astype

@@ -106,7 +106,8 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
         # Choose center candidates by sampling with probability proportional
         # to the squared distance to the closest existing center
         rand_vals = random_state.random_sample(n_local_trials) * current_pot
-        candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)
+        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
+                                        rand_vals)
 
         # Compute distances to center candidates
         distance_to_candidates = euclidean_distances(
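
Why this call site matters: k-means++ inverts the cumulative distribution of squared distances with np.searchsorted, so a cumsum whose tail has drifted skews which points become center candidates. A toy sketch of the sampling pattern (all values illustrative):

import numpy as np

rng = np.random.RandomState(0)
closest_dist_sq = rng.rand(10) ** 2      # squared distance to nearest center
current_pot = closest_dist_sq.sum()      # total potential

# Sample indices with probability proportional to squared distance by
# inverting the cumulative distribution with searchsorted.
rand_vals = rng.random_sample(3) * current_pot
candidate_ids = np.searchsorted(np.cumsum(closest_dist_sq, dtype=np.float64),
                                rand_vals)
print(candidate_ids)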

sklearn/datasets/samples_generator.py
1 addition & 1 deletion (a whitespace-only docstring fix; the changed characters are invisible in plain text)

@@ -1194,7 +1194,7 @@ def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False,
         The size of the random matrix to generate.
 
     alpha : float between 0 and 1, optional (default=0.95)
-        The probability that a coefficient is zero (see notes). Larger values
+        The probability that a coefficient is zero (see notes). Larger values
         enforce more sparsity.
 
     random_state : int, RandomState instance or None, optional (default=None)

sklearn/decomposition/pca.py
2 additions & 1 deletion

@@ -24,6 +24,7 @@
 from ..utils import check_random_state, as_float_array
 from ..utils import check_array
 from ..utils.extmath import fast_dot, fast_logdet, randomized_svd, svd_flip
+from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
 from ..utils.arpack import svds

@@ -393,7 +394,7 @@ def _fit_full(self, X, n_components):
         elif 0 < n_components < 1.0:
             # number of components for which the cumulated explained
             # variance percentage is superior to the desired threshold
-            ratio_cumsum = explained_variance_ratio_.cumsum()
+            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
             n_components = np.searchsorted(ratio_cumsum, n_components) + 1
 
         # Compute noise covariance using Probabilistic PCA model
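
The PCA call site: when n_components is a fraction, _fit_full keeps the smallest number of components whose cumulative explained-variance ratio reaches that fraction, so the cumsum's accuracy directly determines the chosen rank. A toy sketch with made-up ratios:

import numpy as np

explained_variance_ratio_ = np.array([0.6, 0.25, 0.1, 0.05])
threshold = 0.9  # interpreted as "explain at least 90% of the variance"

ratio_cumsum = np.cumsum(explained_variance_ratio_, dtype=np.float64)
n_components = np.searchsorted(ratio_cumsum, threshold) + 1
print(n_components)  # 3, since 0.6 + 0.25 + 0.1 = 0.95 >= 0.9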

sklearn/ensemble/weight_boosting.py
3 additions & 2 deletions

@@ -38,6 +38,7 @@
 from ..tree.tree import BaseDecisionTree
 from ..tree._tree import DTYPE
 from ..utils import check_array, check_X_y, check_random_state
+from ..utils.extmath import stable_cumsum
 from ..metrics import accuracy_score, r2_score
 from sklearn.utils.validation import has_fit_parameter, check_is_fitted
 

@@ -1002,7 +1003,7 @@ def _boost(self, iboost, X, y, sample_weight, random_state):
 
         # Weighted sampling of the training set with replacement
         # For NumPy >= 1.7.0 use np.random.choice
-        cdf = sample_weight.cumsum()
+        cdf = stable_cumsum(sample_weight)
         cdf /= cdf[-1]
         uniform_samples = random_state.random_sample(X.shape[0])
         bootstrap_idx = cdf.searchsorted(uniform_samples, side='right')

@@ -1059,7 +1060,7 @@ def _get_median_predict(self, X, limit):
         sorted_idx = np.argsort(predictions, axis=1)
 
         # Find index of median prediction for each sample
-        weight_cdf = self.estimator_weights_[sorted_idx].cumsum(axis=1)
+        weight_cdf = stable_cumsum(self.estimator_weights_[sorted_idx], axis=1)
         median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]
         median_idx = median_or_above.argmax(axis=1)
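
The second hunk here is what motivated the new axis argument: AdaBoost's weighted median builds one weight CDF per sample along axis 1. A toy sketch of that selection (all values illustrative):

import numpy as np

predictions = np.array([[1.0, 3.0, 2.0],     # per-sample predictions from
                        [5.0, 4.0, 6.0]])    # three boosted estimators
estimator_weights_ = np.array([0.2, 0.5, 0.3])

sorted_idx = np.argsort(predictions, axis=1)

# Running weight along each row; the weighted median is the first sorted
# prediction at which the running weight reaches half the total.
weight_cdf = np.cumsum(estimator_weights_[sorted_idx], axis=1)
median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]
median_idx = median_or_above.argmax(axis=1)
median_estimators = sorted_idx[np.arange(len(predictions)), median_idx]
print(predictions[np.arange(len(predictions)), median_estimators])  # [2. 4.]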

sklearn/manifold/locally_linear.py
2 additions & 1 deletion

@@ -10,6 +10,7 @@
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_random_state, check_array
 from ..utils.arpack import eigsh
+from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
 from ..utils.validation import FLOAT_DTYPES
 from ..neighbors import NearestNeighbors

@@ -420,7 +421,7 @@ def locally_linear_embedding(
         # this is the size of the largest set of eigenvalues
         # such that Sum[v; v in set]/Sum[v; v not in set] < eta
         s_range = np.zeros(N, dtype=int)
-        evals_cumsum = np.cumsum(evals, 1)
+        evals_cumsum = stable_cumsum(evals, 1)
         eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
         for i in range(N):
             s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)
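
The MLLE hunk uses a row-wise cumsum to rewrite the ratio of excluded to included eigenvalue mass as total/prefix - 1, which is the other caller that needs an axis argument. A sketch on one made-up row of eigenvalues:

import numpy as np

evals = np.array([[4.0, 2.0, 1.0, 0.5]])   # one row of eigenvalues per point

evals_cumsum = np.cumsum(evals, axis=1)    # prefix sums along each row

# total / prefix - 1 == (sum of remaining eigenvalues) / (prefix sum)
eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
print(eta_range)  # [[0.875 0.25 0.07142857]]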

sklearn/mixture/dpgmm.py
2 additions & 2 deletions

@@ -24,7 +24,7 @@
 
 from ..externals.six.moves import xrange
 from ..utils import check_random_state, check_array, deprecated
-from ..utils.extmath import logsumexp, pinvh, squared_norm
+from ..utils.extmath import logsumexp, pinvh, squared_norm, stable_cumsum
 from ..utils.validation import check_is_fitted
 from .. import cluster
 from .gmm import _GMMBase

@@ -462,7 +462,7 @@ def _bound_proportions(self, z):
         dg1 = digamma(self.gamma_.T[1]) - dg12
         dg2 = digamma(self.gamma_.T[2]) - dg12
 
-        cz = np.cumsum(z[:, ::-1], axis=-1)[:, -2::-1]
+        cz = stable_cumsum(z[:, ::-1], axis=-1)[:, -2::-1]
         logprior = np.sum(cz * dg2[:-1]) + np.sum(z * dg1)
         del cz  # Save memory
         z_non_zeros = z[z > np.finfo(np.float32).eps]
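
The dpgmm line is the least obvious caller: reversing, cumsumming, dropping the grand total, and reversing back leaves, in column k, the summed responsibility of all components after k (the stick-breaking tail masses). A sketch confirming that reading:

import numpy as np

z = np.array([[0.1, 0.2, 0.3, 0.4]])   # responsibilities over 4 components

# Reverse, cumsum, drop the full total, reverse back: column k now holds
# the sum of z[:, k+1:].
cz = np.cumsum(z[:, ::-1], axis=-1)[:, -2::-1]
print(cz)                                               # approx. [[0.9 0.7 0.4]]
print(z[0, 1:].sum(), z[0, 2:].sum(), z[0, 3:].sum())   # approx. 0.9 0.7 0.4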

sklearn/utils/extmath.py
16 additions & 7 deletions

@@ -25,7 +25,7 @@
 from ..externals.six.moves import xrange
 from .sparsefuncs_fast import csr_row_norms
 from .validation import check_array
-from ..exceptions import NonBLASDotWarning
+from ..exceptions import ConvergenceWarning, NonBLASDotWarning
 
 
 def norm(x):

@@ -844,21 +844,30 @@ def _deterministic_vector_sign_flip(u):
     return u
 
 
-def stable_cumsum(arr, rtol=1e-05, atol=1e-08):
+def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
     """Use high precision for cumsum and check that final value matches sum
 
     Parameters
     ----------
     arr : array-like
         To be cumulatively summed as flat
+    axis : int, optional
+        Axis along which the cumulative sum is computed.
+        The default (None) is to compute the cumsum over the flattened array.
     rtol : float
         Relative tolerance, see ``np.allclose``
     atol : float
         Absolute tolerance, see ``np.allclose``
     """
-    out = np.cumsum(arr, dtype=np.float64)
-    expected = np.sum(arr, dtype=np.float64)
-    if not np.allclose(out[-1], expected, rtol=rtol, atol=atol):
-        raise RuntimeError('cumsum was found to be unstable: '
-                           'its last element does not correspond to sum')
+    # sum is as unstable as cumsum for numpy < 1.9
+    if np_version < (1, 9):
+        return np.cumsum(arr, axis=axis, dtype=np.float64)
+
+    out = np.cumsum(arr, axis=axis, dtype=np.float64)
+    expected = np.sum(arr, axis=axis, dtype=np.float64)
+    if not np.all(np.isclose(out.take(-1, axis=axis), expected, rtol=rtol,
+                             atol=atol, equal_nan=True)):
+        warnings.warn('cumsum was found to be unstable: '
+                      'its last element does not correspond to sum',
+                      ConvergenceWarning)
     return out
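
Behavior of the rewritten helper as of this commit (later releases may differ): the result is always a float64 cumsum, and an unstable result now emits a ConvergenceWarning instead of raising, so callers still get a usable best-effort value. A sketch of both paths, forcing the warning with zero tolerances exactly as the test below does:

import warnings
import numpy as np
from sklearn.utils.extmath import stable_cumsum

x = np.random.RandomState(0).rand(100000)

out = stable_cumsum(x)   # normal path: float64 cumsum, no warning

# With rtol=0 and atol=0 the closeness check is almost guaranteed to fail
# on a long random array, exercising the new warning path.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    stable_cumsum(x, rtol=0, atol=0)
print(len(caught))  # 1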

sklearn/utils/stats.py
2 additions & 1 deletion

@@ -1,6 +1,7 @@
 import numpy as np
 from scipy.stats import rankdata as _sp_rankdata
 from .fixes import bincount
+from ..utils.extmath import stable_cumsum
 
 
 # To remove when we support scipy 0.13

@@ -53,7 +54,7 @@ def _weighted_percentile(array, sample_weight, percentile=50):
     sorted_idx = np.argsort(array)
 
     # Find index of median prediction for each sample
-    weight_cdf = sample_weight[sorted_idx].cumsum()
+    weight_cdf = stable_cumsum(sample_weight[sorted_idx])
     percentile_idx = np.searchsorted(
         weight_cdf, (percentile / 100.) * weight_cdf[-1])
     return array[sorted_idx[percentile_idx]]
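
_weighted_percentile follows the same CDF-inversion pattern as the boosting code: cumulate the weights in sorted-value order and locate the requested percentile with searchsorted. A toy sketch:

import numpy as np

array = np.array([3.0, 1.0, 2.0])
sample_weight = np.array([1.0, 1.0, 2.0])
percentile = 50

sorted_idx = np.argsort(array)   # value order: 1.0, 2.0, 3.0

# Running weight in value order; the percentile is the first value whose
# cumulative weight reaches the requested fraction of the total.
weight_cdf = np.cumsum(sample_weight[sorted_idx], dtype=np.float64)
percentile_idx = np.searchsorted(weight_cdf,
                                 (percentile / 100.) * weight_cdf[-1])
print(array[sorted_idx[percentile_idx]])  # 2.0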

sklearn/utils/tests/test_extmath.py
9 additions & 4 deletions

@@ -18,6 +18,7 @@
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_raise_message
+from sklearn.utils.testing import assert_warns
 from sklearn.utils.testing import skip_if_32bit
 from sklearn.utils.testing import SkipTest
 from sklearn.utils.fixes import np_version

@@ -36,6 +37,7 @@
 from sklearn.utils.extmath import _deterministic_vector_sign_flip
 from sklearn.utils.extmath import softmax
 from sklearn.utils.extmath import stable_cumsum
+from sklearn.exceptions import ConvergenceWarning
 from sklearn.datasets.samples_generator import make_low_rank_matrix
 
 

@@ -654,7 +656,10 @@ def test_stable_cumsum():
         raise SkipTest("Sum is as unstable as cumsum for numpy < 1.9")
     assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
     r = np.random.RandomState(0).rand(100000)
-    assert_raise_message(RuntimeError,
-                         'cumsum was found to be unstable: '
-                         'its last element does not correspond to sum',
-                         stable_cumsum, r, rtol=0, atol=0)
+    assert_warns(ConvergenceWarning, stable_cumsum, r, rtol=0, atol=0)
+
+    # test axis parameter
+    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
+    assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
+    assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
+    assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))
