Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

[WIP] Disregard nan fix #10457

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
30 changes: 21 additions & 9 deletions sklearn/preprocessing/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
from ..utils.extmath import _incremental_mean_and_var
from ..utils.fixes import _argmax
from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
inplace_csr_row_normalize_l2)
inplace_csr_row_normalize_l2,
n_samples_count_csc, n_samples_count_csr)
from ..utils.sparsefuncs import (inplace_column_scale,
mean_variance_axis, incr_mean_variance_axis,
min_max_axis)
Expand Down Expand Up @@ -619,7 +620,8 @@ def partial_fit(self, X, y=None):
Ignored
"""
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES)
warn_on_dtype=True, estimator=self,
force_all_finite='allow-nan', dtype=FLOAT_DTYPES)

# Even in the case of `with_mean=False`, we update the mean anyway
# This is needed for the incremental computation of the var
Expand All @@ -634,22 +636,31 @@ def partial_fit(self, X, y=None):
# First pass
if not hasattr(self, 'n_samples_seen_'):
self.mean_, self.var_ = mean_variance_axis(X, axis=0)
self.n_samples_seen_ = X.shape[0]
if isinstance(X, sparse.csc_matrix):
self.n_samples_seen_ = \
n_samples_count_csc(X.data, X.shape,
X.indices, X.indptr)
else:
self.n_samples_seen_ = \
n_samples_count_csr(X.data, X.shape, X.indices)

# Next passes
else:
self.mean_, self.var_, self.n_samples_seen_ = \
incr_mean_variance_axis(X, axis=0,
last_mean=self.mean_,
last_var=self.var_,
last_n=self.n_samples_seen_)
incr_mean_variance_axis(
X, axis=0,
last_mean=self.mean_,
last_var=self.var_,
last_n=0,
last_n_feat=self.n_samples_seen_)
else:
self.mean_ = None
self.var_ = None
else:
# First pass
if not hasattr(self, 'n_samples_seen_'):
self.mean_ = .0
self.n_samples_seen_ = 0
self.n_samples_seen_ = np.zeros(X.shape[1])
if self.with_std:
self.var_ = .0
else:
Expand Down Expand Up @@ -688,7 +699,8 @@ def transform(self, X, y='deprecated', copy=None):

copy = copy if copy is not None else self.copy
X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=True,
estimator=self, dtype=FLOAT_DTYPES)
estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')

if sparse.issparse(X):
if self.with_mean:
Expand Down
12 changes: 6 additions & 6 deletions sklearn/preprocessing/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def test_standard_scaler_1d():
np.zeros_like(n_features))
assert_array_almost_equal(X_scaled.mean(axis=0), .0)
assert_array_almost_equal(X_scaled.std(axis=0), 1.)
assert_equal(scaler.n_samples_seen_, X.shape[0])
assert_equal(scaler.n_samples_seen_[0], X.shape[0])

# check inverse transform
X_scaled_back = scaler.inverse_transform(X_scaled)
Expand Down Expand Up @@ -283,7 +283,7 @@ def test_scaler_2d_arrays():
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X, copy=True)
assert_false(np.any(np.isnan(X_scaled)))
assert_equal(scaler.n_samples_seen_, n_samples)
assert_equal(scaler.n_samples_seen_[0], n_samples)

assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
Expand Down Expand Up @@ -399,7 +399,7 @@ def test_standard_scaler_partial_fit():

assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
assert_equal(scaler_batch.var_, scaler_incr.var_) # Nones
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
assert_equal(scaler_batch.n_samples_seen_[0], scaler_incr.n_samples_seen_[0])

# Test std after 1 step
batch0 = slice(0, chunk_size)
Expand All @@ -423,10 +423,10 @@ def test_standard_scaler_partial_fit():
assert_correct_incr(i, batch_start=batch.start,
batch_stop=batch.stop, n=n,
chunk_size=chunk_size,
n_samples_seen=scaler_incr.n_samples_seen_)
n_samples_seen=scaler_incr.n_samples_seen_[0])

assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
assert_equal(scaler_batch.n_samples_seen_[0], scaler_incr.n_samples_seen_[0])


def test_standard_scaler_partial_fit_numerical_stability():
Expand Down Expand Up @@ -515,7 +515,7 @@ def test_standard_scaler_trasform_with_partial_fit():
assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal
assert_array_less(zero, scaler_incr.scale_ + epsilon)
# (i+1) because the Scaler has been already fitted
assert_equal((i + 1), scaler_incr.n_samples_seen_)
assert_equal((i + 1), scaler_incr.n_samples_seen_[0])


def test_min_max_scaler_iris():
Expand Down
3 changes: 3 additions & 0 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression',
'RANSACRegressor', 'RadiusNeighborsRegressor',
'RandomForestRegressor', 'Ridge', 'RidgeCV']
ALLOW_NAN = ['StandardScaler']


def _yield_non_meta_checks(name, estimator):
Expand Down Expand Up @@ -1024,6 +1025,8 @@ def check_estimators_nan_inf(name, estimator_orig):
error_string_transform = ("Estimator doesn't check for NaN and inf in"
" transform.")
for X_train in [X_train_nan, X_train_inf]:
if np.any(np.isnan(X_train)) and name in ALLOW_NAN:
continue
# catch deprecation warnings
with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
estimator = clone(estimator_orig)
Expand Down
56 changes: 41 additions & 15 deletions sklearn/utils/extmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,7 +643,7 @@ def make_nonnegative(X, min_value=0):


def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
last_sample_count=0):
last_sample_count=0, ignore_nan=True):
"""Calculate mean update and a Youngs and Cramer variance update.

last_mean and last_variance are statistics computed at the last step by the
Expand Down Expand Up @@ -688,29 +688,55 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
# old = stats until now
# new = the current increment
# updated = the aggregated stats
flag = 0 # if flag == 1 then last_sample_count was an array
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a bad variable name

n_features = X.shape[1]
if isinstance(last_sample_count, np.ndarray):
flag = 1
else:
last_sample_count *= np.ones(n_features)

last_sum = last_mean * last_sample_count
new_sum = X.sum(axis=0)
sum_func = np.nansum if ignore_nan else np.sum
new_sum = sum_func(X, axis=0)
new_sum[np.isnan(new_sum)] = 0

new_sample_count = X.shape[0]
new_sample_count = np.count_nonzero(~np.isnan(X), axis=0)
if not isinstance(new_sample_count, np.ndarray):
new_sample_count *= np.ones(n_features)
updated_sample_count = last_sample_count + new_sample_count

updated_mean = (last_sum + new_sum) / updated_sample_count
updated_variance = np.zeros(n_features)

if last_variance is None:
updated_variance = None
else:
new_unnormalized_variance = X.var(axis=0) * new_sample_count
if last_sample_count == 0: # Avoid division by 0
updated_unnormalized_variance = new_unnormalized_variance
else:
last_over_new_count = last_sample_count / new_sample_count
last_unnormalized_variance = last_variance * last_sample_count
updated_unnormalized_variance = (
last_unnormalized_variance +
new_unnormalized_variance +
last_over_new_count / updated_sample_count *
(last_sum / last_over_new_count - new_sum) ** 2)
updated_variance = updated_unnormalized_variance / updated_sample_count
var_func = np.nanvar if ignore_nan else np.var
new_unnormalized_variance = var_func(X, axis=0)
new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0
new_unnormalized_variance = (new_unnormalized_variance *
new_sample_count)
for i in xrange(n_features):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect you can do everything here with vectorized numpy operations and should avoid an explicit loop over each feature.

if updated_sample_count[i] == 0: # Avoid division by 0
continue
# Avoid division by 0
elif last_sample_count[i] == 0 or new_sample_count[i] == 0:
updated_unnormalized_variance = new_unnormalized_variance[i]
else:
last_over_new_count = (last_sample_count[i] /
new_sample_count[i])
last_unnormalized_variance = (last_variance[i] *
last_sample_count[i])
updated_unnormalized_variance = (
last_unnormalized_variance +
new_unnormalized_variance[i] +
last_over_new_count / updated_sample_count[i] *
(last_sum[i] / last_over_new_count - new_sum[i]) ** 2)
updated_variance[i] = (updated_unnormalized_variance /
updated_sample_count[i])

if flag == 0: # If n_sample_count was not an array
updated_sample_count = updated_sample_count[0]

return updated_mean, updated_variance, updated_sample_count

Expand Down
15 changes: 10 additions & 5 deletions sklearn/utils/sparsefuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ def mean_variance_axis(X, axis):
_raise_typeerror(X)


def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n):
def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n,
last_n_feat=np.array([0], dtype=np.uint32)):
"""Compute incremental mean and variance along an axix on a CSR or
CSC matrix.

Expand Down Expand Up @@ -143,17 +144,21 @@ def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n):
if isinstance(X, sp.csr_matrix):
if axis == 0:
return _incr_mean_var_axis0(X, last_mean=last_mean,
last_var=last_var, last_n=last_n)
last_var=last_var, last_n=last_n,
last_n_feat=last_n_feat)
else:
return _incr_mean_var_axis0(X.T, last_mean=last_mean,
last_var=last_var, last_n=last_n)
last_var=last_var, last_n=last_n,
last_n_feat=last_n_feat)
elif isinstance(X, sp.csc_matrix):
if axis == 0:
return _incr_mean_var_axis0(X, last_mean=last_mean,
last_var=last_var, last_n=last_n)
last_var=last_var, last_n=last_n,
last_n_feat=last_n_feat)
else:
return _incr_mean_var_axis0(X.T, last_mean=last_mean,
last_var=last_var, last_n=last_n)
last_var=last_var, last_n=last_n,
last_n_feat=last_n_feat)
else:
_raise_typeerror(X)

Expand Down
Loading