From dc67ec0a0b14d40f54abd386e3fc5a6aee092dee Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 3 Sep 2018 13:23:20 +1000 Subject: [PATCH 01/20] FEA Reinstate ChainedImputer This reverts commit f819704880b5d6affb49996f832f1aa5c8799571. --- doc/modules/classes.rst | 3 +- doc/modules/impute.rst | 71 +++- doc/whats_new/v0.20.rst | 1 - doc/whats_new/v0.21.rst | 8 + examples/plot_missing_values.py | 28 +- sklearn/impute.py | 558 +++++++++++++++++++++++++++++- sklearn/tests/test_impute.py | 233 ++++++++++++- sklearn/utils/estimator_checks.py | 2 +- 8 files changed, 887 insertions(+), 17 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 57ccfb5cff704..2d3174a9dcc05 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -656,8 +656,9 @@ Kernels: :template: class.rst impute.SimpleImputer + impute.ChainedImputer impute.MissingIndicator - + .. _kernel_approximation_ref: :mod:`sklearn.kernel_approximation` Kernel Approximation diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 0fd119857177b..268ce1c3ede19 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -16,6 +16,22 @@ values. However, this comes at the price of losing data which may be valuable i.e., to infer them from the known part of the data. See the :ref:`glossary` entry on imputation. + +Univariate vs. Multivariate Imputation +====================================== + +One type of imputation algorithm is univariate, which imputes values in the i-th +feature dimension using only non-missing values in that feature dimension +(e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation +algorithms use the entire set of available feature dimensions to estimate the +missing values (e.g. :class:`impute.ChainedImputer`). + + +.. _single_imputer: + +Univariate feature imputation +============================= + The :class:`SimpleImputer` class provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or using the statistics (mean, median or most frequent) of each column in which the @@ -71,9 +87,60 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] +.. _chained_imputer: + + +Multivariate feature imputation +=============================== -:class:`SimpleImputer` can be used in a Pipeline as a way to build a composite -estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. +A more sophisticated approach is to use the :class:`ChainedImputer` class, which +implements the imputation technique from MICE (Multivariate Imputation by +Chained Equations). MICE models each feature with missing values as a function of +other features, and uses that estimate for imputation. It does so in a round-robin +fashion: at each step, a feature column is designated as output `y` and the other +feature columns are treated as inputs `X`. A regressor is fit on `(X, y)` for known `y`. +Then, the regressor is used to predict the unknown values of `y`. This is repeated +for each feature in a chained fashion, and then is done for a number of imputation +rounds. 
Here is an example snippet:: + + >>> import numpy as np + >>> from sklearn.impute import ChainedImputer + >>> imp = ChainedImputer(n_imputations=10, random_state=0) + >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) + ChainedImputer(imputation_order='ascending', initial_strategy='mean', + max_value=None, min_value=None, missing_values=nan, n_burn_in=10, + n_imputations=10, n_nearest_features=None, predictor=None, + random_state=0, verbose=False) + >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + >>> print(np.round(imp.transform(X_test))) + [[ 1. 2.] + [ 6. 4.] + [13. 6.]] + +Both :class:`SimpleImputer` and :class:`ChainedImputer` can be used in a Pipeline +as a way to build a composite estimator that supports imputation. +See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. + +.. _multiple_imputation: + +Multiple vs. Single Imputation +============================== + +In the statistics community, it is common practice to perform multiple imputations, +generating, for example, 10 separate imputations for a single feature matrix. +Each of these 10 imputations is then put through the subsequent analysis pipeline +(e.g. feature engineering, clustering, regression, classification). The 10 final +analysis results (e.g. held-out validation error) allow the data scientist to +obtain understanding of the uncertainty inherent in the missing values. The above +practice is called multiple imputation. As implemented, the :class:`ChainedImputer` +class generates a single (averaged) imputation for each missing value because this +is the most common use case for machine learning applications. However, it can also be used +for multiple imputations by applying it repeatedly to the same dataset with different +random seeds with the ``n_imputations`` parameter set to 1. + +Note that a call to the ``transform`` method of :class:`ChainedImputer` is not +allowed to change the number of samples. Therefore multiple imputations cannot be +achieved by a single call to ``transform``. .. _missing_indicator: diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 2ed336b782174..402b7c178c8dd 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -155,7 +155,6 @@ Support for Python 3.3 has been officially dropped. :class:`cluster.AgglomerativeClustering`. :issue:`9875` by :user:`Kumar Ashutosh `. - :mod:`sklearn.compose` ...................... diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 202972f0575c0..2c010e5b1be59 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -40,6 +40,14 @@ Support for Python 3.4 and below has been officially dropped. - An entry goes here - An entry goes here +:mod:`sklearn.impute` +..................... + +- |MajorFeature| Added :class:`impute.ChainedImputer`, which is a strategy for + imputing missing values by modeling each feature with missing values as a + function of other features in a round-robin fashion. :issue:`8478` by + :user:`Sergey Feldman `. + Multiple modules ................ diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 755943fb55bda..3ab1cfff95576 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -3,21 +3,22 @@ Imputing missing values before building an estimator ==================================================== -This example shows that imputing the missing values can give better -results than discarding the samples containing any missing value. 
-Imputing does not always improve the predictions, so please check via -cross-validation. Sometimes dropping rows or using marker values is -more effective. - Missing values can be replaced by the mean, the median or the most frequent value using the basic :func:`sklearn.impute.SimpleImputer`. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). +Another option is the :func:`sklearn.impute.ChainedImputer`. This uses +round-robin linear regression, treating every variable as an output in +turn. The version implemented assumes Gaussian (output) variables. If your +features are obviously non-Normal, consider transforming them to look more +Normal so as to improve performance. + In addition of using an imputing method, we can also keep an indication of the missing information using :func:`sklearn.impute.MissingIndicator` which might carry some information. """ + import numpy as np import matplotlib.pyplot as plt @@ -25,7 +26,7 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import SimpleImputer, MissingIndicator +from sklearn.impute import SimpleImputer, ChainedImputer, MissingIndicator from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) @@ -72,10 +73,18 @@ def get_results(dataset): scoring='neg_mean_squared_error', cv=5) + # Estimate the score after chained imputation of the missing values + estimator = make_pipeline( + make_union(ChainedImputer(missing_values=0, random_state=0), + MissingIndicator(missing_values=0)), + RandomForestRegressor(random_state=0, n_estimators=100)) + chained_impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error') return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std())) + (mean_impute_scores.mean(), mean_impute_scores.std()), + (chained_impute_scores.mean(), chained_impute_scores.std())) results_diabetes = np.array(get_results(load_diabetes())) @@ -91,7 +100,8 @@ def get_results(dataset): x_labels = ['Full data', 'Zero imputation', - 'Mean Imputation'] + 'Mean Imputation', + 'Chained Imputation'] colors = ['r', 'g', 'b', 'orange'] # plot diabetes results diff --git a/sklearn/impute.py b/sklearn/impute.py index e98c425d1b34f..b9afbc76bd2d8 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -3,16 +3,22 @@ # Sergey Feldman # License: BSD 3 clause +from __future__ import division + import warnings import numbers +import time import numpy as np import numpy.ma as ma from scipy import sparse from scipy import stats +from collections import namedtuple from .base import BaseEstimator, TransformerMixin -from .utils import check_array +from .base import clone +from .preprocessing import normalize +from .utils import check_array, check_random_state, safe_indexing from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES @@ -24,9 +30,14 @@ zip = six.moves.zip map = six.moves.map +ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', + 'neighbor_feat_idx', + 'predictor']) + __all__ = [ 'MissingIndicator', 'SimpleImputer', + 'ChainedImputer', ] @@ -409,6 +420,551 @@ def transform(self, X): return X +class ChainedImputer(BaseEstimator, TransformerMixin): + """Chained imputer transformer to impute missing values. 
+ + Basic implementation of chained imputer from MICE (Multivariate + Imputations by Chained Equations) package from R. This version assumes all + of the features are Gaussian. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + missing_values : int, np.nan, optional (default=np.nan) + The placeholder for the missing values. All occurrences of + ``missing_values`` will be imputed. + + imputation_order : str, optional (default="ascending") + The order in which the features will be imputed. Possible values: + + "ascending" + From features with fewest missing values to most. + "descending" + From features with most missing values to fewest. + "roman" + Left to right. + "arabic" + Right to left. + "random" + A random order for each round. + + n_imputations : int, optional (default=100) + Number of chained imputation rounds to perform, the results of which + will be used in the final average. + + n_burn_in : int, optional (default=10) + Number of initial imputation rounds to perform the results of which + will not be returned. + + predictor : estimator object, default=BayesianRidge() + The predictor to use at each step of the round-robin imputation. + It must support ``return_std`` in its ``predict`` method. + + n_nearest_features : int, optional (default=None) + Number of other features to use to estimate the missing values of + the each feature column. Nearness between features is measured using + the absolute correlation coefficient between each feature pair (after + initial imputation). Can provide significant speed-up when the number + of features is huge. If ``None``, all features will be used. + + initial_strategy : str, optional (default="mean") + Which strategy to use to initialize the missing values. Same as the + ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` + Valid values: {"mean", "median", "most_frequent", or "constant"}. + + min_value : float, optional (default=None) + Minimum possible imputed value. Default of ``None`` will set minimum + to negative infinity. + + max_value : float, optional (default=None) + Maximum possible imputed value. Default of ``None`` will set maximum + to positive infinity. + + verbose : int, optional (default=0) + Verbosity flag, controls the debug messages that are issued + as functions are evaluated. The higher, the more verbose. Can be 0, 1, + or 2. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by ``np.random``. + + Attributes + ---------- + initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' + The imputer used to initialize the missing values. + + imputation_sequence_ : list of tuples + Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where + ``feat_idx`` is the current feature to be imputed, + ``neighbor_feat_idx`` is the array of other features used to impute the + current feature, and ``predictor`` is the trained predictor used for + the imputation. + + Notes + ----- + The R version of MICE does not have inductive functionality, i.e. first + fitting on ``X_train`` and then transforming any ``X_test`` without + additional fitting. 
We do this by storing each feature's predictor during + the round-robin ``fit`` phase, and predicting without refitting (in order) + during the ``transform`` phase. + + Features which contain all missing values at ``fit`` are discarded upon + ``transform``. + + Features with missing values in transform which did not have any missing + values in fit will be imputed with the initial imputation method only. + + References + ---------- + .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: + Multivariate Imputation by Chained Equations in R". Journal of + Statistical Software 45: 1-67. + `_ + """ + + def __init__(self, + missing_values=np.nan, + imputation_order='ascending', + n_imputations=100, + n_burn_in=10, + predictor=None, + n_nearest_features=None, + initial_strategy="mean", + min_value=None, + max_value=None, + verbose=False, + random_state=None): + + self.missing_values = missing_values + self.imputation_order = imputation_order + self.n_imputations = n_imputations + self.n_burn_in = n_burn_in + self.predictor = predictor + self.n_nearest_features = n_nearest_features + self.initial_strategy = initial_strategy + self.min_value = min_value + self.max_value = max_value + self.verbose = verbose + self.random_state = random_state + + def _impute_one_feature(self, + X_filled, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + predictor=None, + fit_mode=True): + """Impute a single feature from the others provided. + + This function predicts the missing values of one of the features using + the current estimates of all the other features. The ``predictor`` must + support ``return_std=True`` in its ``predict`` method for this function + to work. + + Parameters + ---------- + X_filled : ndarray + Input data with the most recent imputations. + + mask_missing_values : ndarray + Input data's missing indicator matrix. + + feat_idx : int + Index of the feature currently being imputed. + + neighbor_feat_idx : ndarray + Indices of the features to be used in imputing ``feat_idx``. + + predictor : object + The predictor to use at this step of the round-robin imputation. + It must support ``return_std`` in its ``predict`` method. + If None, it will be cloned from self._predictor. + + fit_mode : boolean, default=True + Whether to fit and predict with the predictor or just predict. + + Returns + ------- + X_filled : ndarray + Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. + + predictor : predictor with sklearn API + The fitted predictor used to impute + ``X_filled[missing_row_mask, feat_idx]``. 
+ """ + + # if nothing is missing, just return the default + # (should not happen at fit time because feat_ids would be excluded) + missing_row_mask = mask_missing_values[:, feat_idx] + if not np.any(missing_row_mask): + return X_filled, predictor + + if predictor is None and fit_mode is False: + raise ValueError("If fit_mode is False, then an already-fitted " + "predictor should be passed in.") + + if predictor is None: + predictor = clone(self._predictor) + + if fit_mode: + X_train = safe_indexing(X_filled[:, neighbor_feat_idx], + ~missing_row_mask) + y_train = safe_indexing(X_filled[:, feat_idx], + ~missing_row_mask) + predictor.fit(X_train, y_train) + + # get posterior samples + X_test = safe_indexing(X_filled[:, neighbor_feat_idx], + missing_row_mask) + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + imputed_values[good_sigmas] = self.random_state_.normal( + loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + + # clip the values + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) + + # update the feature + X_filled[missing_row_mask, feat_idx] = imputed_values + return X_filled, predictor + + def _get_neighbor_feat_idx(self, + n_features, + feat_idx, + abs_corr_mat): + """Get a list of other features to predict ``feat_idx``. + + If self.n_nearest_features is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between ``feat_idx`` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in ``X``. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X``. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute ``feat_idx``. + """ + if (self.n_nearest_features is not None and + self.n_nearest_features < n_features): + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, + p=p) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. 
+ """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + missing_values_idx = np.nonzero(frac_of_missing_values)[0] + if self.imputation_order == 'roman': + ordered_idx = missing_values_idx + elif self.imputation_order == 'arabic': + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == 'ascending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:][::-1] + elif self.imputation_order == 'descending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:] + elif self.imputation_order == 'random': + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + else: + raise ValueError("Got an invalid imputation order: '{0}'. It must " + "be one of the following: 'roman', 'arabic', " + "'ascending', 'descending', or " + "'random'.".format(self.imputation_order)) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, optional (default=1e-6) + ``abs_corr_mat`` can have nans, which will be replaced + with ``tolerance``. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X`` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. + """ + n_features = X_filled.shape[1] + if (self.n_nearest_features is None or + self.n_nearest_features >= n_features): + return None + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X): + """Perform initial imputation for input X. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + Returns + ------- + Xt : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + mask_missing_values : ndarray, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. 
+ """ + if is_scalar_nan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + X = check_array(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + + mask_missing_values = _get_mask(X, self.missing_values) + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.transform(X) + + valid_mask = np.flatnonzero(np.logical_not( + np.isnan(self.initial_imputer_.statistics_))) + Xt = X[:, valid_mask] + mask_missing_values = mask_missing_values[:, valid_mask] + + return Xt, X_filled, mask_missing_values + + def fit_transform(self, X, y=None): + """Fits the imputer on X and return the transformed X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + self.random_state_ = getattr(self, "random_state_", + check_random_state(self.random_state)) + + if self.predictor is None: + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() + else: + self._predictor = clone(self.predictor) + + self._min_value = np.nan if self.min_value is None else self.min_value + self._max_value = np.nan if self.max_value is None else self.max_value + + self.initial_imputer_ = None + X, X_filled, mask_missing_values = self._initial_imputation(X) + + # edge case: in case the user specifies 0 for n_imputations, + # then there is no need to do burn in and the result should be + # just the initial imputation (before clipping) + if self.n_imputations < 1: + return X_filled + + X_filled = np.clip(X_filled, self._min_value, self._max_value) + + # order in which to impute + # note this is probably too slow for large feature data (d > 100000) + # and a better way would be good. + # see: https://goo.gl/KyCNwj and subsequent comments + ordered_idx = self._get_ordered_idx(mask_missing_values) + + abs_corr_mat = self._get_abs_corr_mat(X_filled) + + # impute data + n_rounds = self.n_burn_in + self.n_imputations + n_samples, n_features = X_filled.shape + Xt = np.zeros((n_samples, n_features), dtype=X.dtype) + self.imputation_sequence_ = [] + if self.verbose > 0: + print("[ChainedImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + for i_rnd in range(n_rounds): + if self.imputation_order == 'random': + ordered_idx = self._get_ordered_idx(mask_missing_values) + + for feat_idx in ordered_idx: + neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, + feat_idx, + abs_corr_mat) + X_filled, predictor = self._impute_one_feature( + X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, + predictor=None, fit_mode=True) + predictor_triplet = ImputerTriplet(feat_idx, + neighbor_feat_idx, + predictor) + self.imputation_sequence_.append(predictor_triplet) + + if i_rnd >= self.n_burn_in: + Xt += X_filled + if self.verbose > 0: + print('[ChainedImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (i_rnd + 1, n_rounds, time() - start_t)) + + Xt /= self.n_imputations + Xt[~mask_missing_values] = X[~mask_missing_values] + return Xt + + def transform(self, X): + """Imputes all missing values in X. 
+ + Note that this is stochastic, and that if random_state is not fixed, + repeated calls, or permuted input, will yield different results. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The input data to complete. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + check_is_fitted(self, 'initial_imputer_') + + X, X_filled, mask_missing_values = self._initial_imputation(X) + + # edge case: in case the user specifies 0 for n_imputations, + # then there is no need to do burn in and the result should be + # just the initial imputation (before clipping) + if self.n_imputations < 1: + return X_filled + + X_filled = np.clip(X_filled, self._min_value, self._max_value) + + n_rounds = self.n_burn_in + self.n_imputations + n_imputations = len(self.imputation_sequence_) + imputations_per_round = n_imputations // n_rounds + i_rnd = 0 + Xt = np.zeros(X.shape, dtype=X.dtype) + if self.verbose > 0: + print("[ChainedImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + for it, predictor_triplet in enumerate(self.imputation_sequence_): + X_filled, _ = self._impute_one_feature( + X_filled, + mask_missing_values, + predictor_triplet.feat_idx, + predictor_triplet.neighbor_feat_idx, + predictor=predictor_triplet.predictor, + fit_mode=False + ) + if not (it + 1) % imputations_per_round: + if i_rnd >= self.n_burn_in: + Xt += X_filled + if self.verbose > 1: + print('[ChainedImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (i_rnd + 1, n_rounds, time() - start_t)) + i_rnd += 1 + + Xt /= self.n_imputations + Xt[~mask_missing_values] = X[~mask_missing_values] + return Xt + + def fit(self, X, y=None): + """Fits the imputer on X and return self. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored + + Returns + ------- + self : object + Returns self. + """ + self.fit_transform(X) + return self + + class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f25f76e611d77..f9c3e4902f145 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1,3 +1,5 @@ +from __future__ import division + import pytest import numpy as np @@ -12,7 +14,9 @@ from sklearn.utils.testing import assert_false from sklearn.impute import MissingIndicator -from sklearn.impute import SimpleImputer +from sklearn.impute import SimpleImputer, ChainedImputer +from sklearn.dummy import DummyRegressor +from sklearn.linear_model import BayesianRidge, ARDRegression from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree @@ -69,6 +73,10 @@ def test_imputation_shape(): X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) + chained_imputer = ChainedImputer(initial_strategy=strategy) + X_imputed = chained_imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) + @pytest.mark.parametrize("strategy", ["const", 101, None]) def test_imputation_error_invalid_strategy(strategy): @@ -500,6 +508,227 @@ def test_imputation_copy(): # made, even if copy=False. 
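+# A minimal usage sketch (an illustrative addition, not from the original
+# patch): whatever values ChainedImputer fills in, it must preserve the
+# observed entries exactly and keep the shape of the input.
+def test_chained_imputer_preserves_observed_values():
+    rng = np.random.RandomState(0)
+    X = rng.rand(20, 5)
+    X[X < 0.2] = np.nan
+    imputer = ChainedImputer(n_imputations=2, n_burn_in=1, random_state=0)
+    Xt = imputer.fit_transform(X)
+    assert Xt.shape == X.shape
+    assert_allclose(Xt[~np.isnan(X)], X[~np.isnan(X)])
+
+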
+def test_chained_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 100 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = ChainedImputer(n_imputations=5, + n_burn_in=5, + verbose=True, + random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.001) + + +@pytest.mark.parametrize( + "imputation_order", + ['random', 'roman', 'ascending', 'descending', 'arabic'] +) +def test_chained_imputer_imputation_order(imputation_order): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 # this column should not be discarded by ChainedImputer + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + n_nearest_features=5, + min_value=0, + max_value=1, + verbose=False, + imputation_order=imputation_order, + random_state=rng) + imputer.fit_transform(X) + ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + if imputation_order == 'roman': + assert np.all(ordered_idx[:d-1] == np.arange(1, d)) + elif imputation_order == 'arabic': + assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1)) + elif imputation_order == 'random': + ordered_idx_round_1 = ordered_idx[:d-1] + ordered_idx_round_2 = ordered_idx[d-1:] + assert ordered_idx_round_1 != ordered_idx_round_2 + elif 'ending' in imputation_order: + assert len(ordered_idx) == 2 * (d - 1) + + +@pytest.mark.parametrize( + "predictor", + [DummyRegressor(), BayesianRidge(), ARDRegression()] +) +def test_chained_imputer_predictors(predictor): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + predictor=predictor, + random_state=rng) + imputer.fit_transform(X) + + # check that types are correct for predictors + hashes = [] + for triplet in imputer.imputation_sequence_: + assert triplet.predictor + hashes.append(id(triplet.predictor)) + + # check that each predictor is unique + assert len(set(hashes)) == len(hashes) + + +def test_chained_imputer_clip(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, + random_state=rng).toarray() + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + min_value=0.1, + max_value=0.2, + random_state=rng) + + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +@pytest.mark.parametrize( + "strategy", + ["mean", "median", "most_frequent"] +) +def test_chained_imputer_missing_at_transform(strategy): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X_train = rng.randint(low=0, high=3, size=(n, d)) + X_test = rng.randint(low=0, high=3, size=(n, d)) + + X_train[:, 0] = 1 # definitely no missing values in 0th column + X_test[0, 0] = 0 # definitely missing value in 0th column + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + initial_strategy=strategy, + random_state=rng).fit(X_train) + initial_imputer = SimpleImputer(missing_values=0, + strategy=strategy).fit(X_train) + + # if there were no missing values at time of fit, then imputer will + # only use the initial imputer for that feature at transform + assert np.all(imputer.transform(X_test)[:, 0] == + 
initial_imputer.transform(X_test)[:, 0]) + + +def test_chained_imputer_transform_stochasticity(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, + random_state=rng).toarray() + + imputer = ChainedImputer(missing_values=0, + n_imputations=1, + n_burn_in=1, + random_state=rng) + imputer.fit(X) + + X_fitted_1 = imputer.transform(X) + X_fitted_2 = imputer.transform(X) + + # sufficient to assert that the means are not the same + assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) + + +def test_chained_imputer_no_missing(): + rng = np.random.RandomState(0) + X = rng.rand(100, 100) + X[:, 0] = np.nan + m1 = ChainedImputer(n_imputations=10, random_state=rng) + m2 = ChainedImputer(n_imputations=10, random_state=rng) + pred1 = m1.fit(X).transform(X) + pred2 = m2.fit_transform(X) + # should exclude the first column entirely + assert_allclose(X[:, 1:], pred1) + # fit and fit_transform should both be identical + assert_allclose(pred1, pred2) + + +@pytest.mark.parametrize( + "rank", + [3, 5] +) +def test_chained_imputer_transform_recovery(rank): + rng = np.random.RandomState(0) + n = 100 + d = 100 + A = rng.rand(n, rank) + B = rng.rand(rank, d) + X_filled = np.dot(A, B) + # half is randomly missing + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data in half + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = ChainedImputer(n_imputations=10, + n_burn_in=10, + verbose=True, + random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1) + + +def test_chained_imputer_additive_matrix(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + A = rng.randn(n, d) + B = rng.randn(n, d) + X_filled = np.zeros(A.shape) + for i in range(d): + for j in range(d): + X_filled[:, (i+j) % d] += (A[:, i] + B[:, j]) / 2 + # a quarter is randomly missing + nan_mask = rng.rand(n, d) < 0.25 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = ChainedImputer(n_imputations=25, + n_burn_in=10, + verbose=True, + random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, atol=0.01) + + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), @@ -616,7 +845,7 @@ def test_missing_indicator_sparse_param(arr_type, missing_values, @pytest.mark.parametrize("imputer_constructor", - [SimpleImputer]) + [SimpleImputer, ChainedImputer]) @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 54369033a75d3..931e50d920402 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -77,7 +77,7 @@ 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', +ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer', 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', 'PowerTransformer', 'QuantileTransformer'] From cbf89ecfd74138510e158ee2dd0d5fb2ba40557a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 3 Sep 2018 14:33:56 +1000 
Subject: [PATCH 02/20] Fix import of time --- sklearn/impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index b9afbc76bd2d8..89fb33a4f9034 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -7,7 +7,7 @@ import warnings import numbers -import time +from time import time import numpy as np import numpy.ma as ma From a4f2a89546a455a914a08f7e651e5e85823c0c16 Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Mon, 17 Sep 2018 02:15:28 +0300 Subject: [PATCH 03/20] [MRG] ChainedImputer -> IterativeImputer, and documentation update (#11350) Towards making this more generic than MICE --- doc/modules/classes.rst | 2 +- doc/modules/impute.rst | 83 ++++++++----- doc/whats_new/v0.21.rst | 2 +- examples/plot_missing_values.py | 18 +-- sklearn/impute.py | 197 +++++++++++++++++------------- sklearn/tests/test_impute.py | 188 ++++++++++++++++------------ sklearn/utils/estimator_checks.py | 3 +- 7 files changed, 283 insertions(+), 210 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2d3174a9dcc05..ab0f473be4083 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -656,7 +656,7 @@ Kernels: :template: class.rst impute.SimpleImputer - impute.ChainedImputer + impute.IterativeImputer impute.MissingIndicator .. _kernel_approximation_ref: diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 268ce1c3ede19..8bb3ad8bf940b 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -24,7 +24,7 @@ One type of imputation algorithm is univariate, which imputes values in the i-th feature dimension using only non-missing values in that feature dimension (e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation algorithms use the entire set of available feature dimensions to estimate the -missing values (e.g. :class:`impute.ChainedImputer`). +missing values (e.g. :class:`impute.IterativeImputer`). .. _single_imputer: @@ -87,37 +87,37 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] -.. _chained_imputer: +.. _iterative_imputer: Multivariate feature imputation =============================== -A more sophisticated approach is to use the :class:`ChainedImputer` class, which -implements the imputation technique from MICE (Multivariate Imputation by -Chained Equations). MICE models each feature with missing values as a function of -other features, and uses that estimate for imputation. It does so in a round-robin -fashion: at each step, a feature column is designated as output `y` and the other -feature columns are treated as inputs `X`. A regressor is fit on `(X, y)` for known `y`. -Then, the regressor is used to predict the unknown values of `y`. This is repeated -for each feature in a chained fashion, and then is done for a number of imputation -rounds. Here is an example snippet:: +A more sophisticated approach is to use the :class:`IterativeImputer` class, +which models each feature with missing values as a function of other features, +and uses that estimate for imputation. It does so in an iterated round-robin +fashion: at each step, a feature column is designated as output ``y`` and the +other feature columns are treated as inputs ``X``. A regressor is fit on ``(X, +y)`` for known ``y``. Then, the regressor is used to predict the missing values +of ``y``. This is done for each feature in an iterative fashion, and then is +repeated for ``n_iter`` imputation rounds. 
The results of the final imputation +round are returned. >>> import numpy as np - >>> from sklearn.impute import ChainedImputer - >>> imp = ChainedImputer(n_imputations=10, random_state=0) + >>> from sklearn.impute import IterativeImputer + >>> imp = IterativeImputer(n_iter=10, random_state=0) >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) - ChainedImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values=nan, n_burn_in=10, - n_imputations=10, n_nearest_features=None, predictor=None, - random_state=0, verbose=False) + IterativeImputer(imputation_order='ascending', initial_strategy='mean', + max_value=None, min_value=None, missing_values=nan, n_iter=10, + n_nearest_features=None, predictor=None, random_state=0, + sample_posterior=False, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 4.] - [13. 6.]] + [ 6. 3.] + [24. 6.]] -Both :class:`SimpleImputer` and :class:`ChainedImputer` can be used in a Pipeline +Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. @@ -127,21 +127,40 @@ Multiple vs. Single Imputation ============================== In the statistics community, it is common practice to perform multiple imputations, -generating, for example, 10 separate imputations for a single feature matrix. -Each of these 10 imputations is then put through the subsequent analysis pipeline -(e.g. feature engineering, clustering, regression, classification). The 10 final -analysis results (e.g. held-out validation error) allow the data scientist to -obtain understanding of the uncertainty inherent in the missing values. The above -practice is called multiple imputation. As implemented, the :class:`ChainedImputer` -class generates a single (averaged) imputation for each missing value because this -is the most common use case for machine learning applications. However, it can also be used -for multiple imputations by applying it repeatedly to the same dataset with different -random seeds with the ``n_imputations`` parameter set to 1. - -Note that a call to the ``transform`` method of :class:`ChainedImputer` is not +generating, for example, ``m`` separate imputations for a single feature matrix. +Each of these ``m`` imputations is then put through the subsequent analysis pipeline +(e.g. feature engineering, clustering, regression, classification). The ``m`` final +analysis results (e.g. held-out validation errors) allow the data scientist +to obtain understanding of how analytic results may differ as a consequence +of the inherent uncertainty caused by the missing values. The above practice +is called multiple imputation. + +Our implementation of :class:`IterativeImputer` was inspired by the R MICE +package (Multivariate Imputation by Chained Equations) [1]_, but differs from +it by returning a single imputation instead of multiple imputations. However, +:class:`IterativeImputer` can also be used for multiple imputations by applying +it repeatedly to the same dataset with different random seeds when +``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple +vs. single imputations. + +It is still an open problem as to how useful single vs. multiple imputation is in +the context of prediction and classification when the user is not interested in +measuring uncertainty due to missing values. 
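+
+As a minimal sketch (the loop and the choice of five seeds below are
+illustrative, not part of the scikit-learn API), multiple imputations can be
+drawn along these lines::
+
+    >>> import numpy as np
+    >>> from sklearn.impute import IterativeImputer
+    >>> X = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
+    >>> imputations = [
+    ...     IterativeImputer(sample_posterior=True, random_state=seed)
+    ...     .fit_transform(X) for seed in range(5)]
+
+Each element of ``imputations`` is one complete imputed dataset; the spread
+across elements reflects the uncertainty caused by the missing entries.
+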
+ +Note that a call to the ``transform`` method of :class:`IterativeImputer` is not allowed to change the number of samples. Therefore multiple imputations cannot be achieved by a single call to ``transform``. +References +========== + +.. [1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate + Imputation by Chained Equations in R". Journal of Statistical Software 45: + 1-67. + +.. [2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis + with Missing Data". John Wiley & Sons, Inc., New York, NY, USA. + .. _missing_indicator: Marking imputed values diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 2c010e5b1be59..2159e39dc126d 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -43,7 +43,7 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.impute` ..................... -- |MajorFeature| Added :class:`impute.ChainedImputer`, which is a strategy for +- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion. :issue:`8478` by :user:`Sergey Feldman `. diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 3ab1cfff95576..43d7ddfc497f3 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -4,11 +4,11 @@ ==================================================== Missing values can be replaced by the mean, the median or the most frequent -value using the basic :func:`sklearn.impute.SimpleImputer`. +value using the basic :class:`sklearn.impute.SimpleImputer`. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). -Another option is the :func:`sklearn.impute.ChainedImputer`. This uses +Another option is the :class:`sklearn.impute.IterativeImputer`. This uses round-robin linear regression, treating every variable as an output in turn. The version implemented assumes Gaussian (output) variables. 
If your features are obviously non-Normal, consider transforming them to look more @@ -26,7 +26,7 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import SimpleImputer, ChainedImputer, MissingIndicator +from sklearn.impute import SimpleImputer, IterativeImputer, MissingIndicator from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) @@ -73,18 +73,18 @@ def get_results(dataset): scoring='neg_mean_squared_error', cv=5) - # Estimate the score after chained imputation of the missing values + # Estimate the score after iterative imputation of the missing values estimator = make_pipeline( - make_union(ChainedImputer(missing_values=0, random_state=0), + make_union(IterativeImputer(missing_values=0, random_state=0), MissingIndicator(missing_values=0)), RandomForestRegressor(random_state=0, n_estimators=100)) - chained_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error') + iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error') return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), (mean_impute_scores.mean(), mean_impute_scores.std()), - (chained_impute_scores.mean(), chained_impute_scores.std())) + (iterative_impute_scores.mean(), iterative_impute_scores.std())) results_diabetes = np.array(get_results(load_diabetes())) @@ -101,7 +101,7 @@ def get_results(dataset): x_labels = ['Full data', 'Zero imputation', 'Mean Imputation', - 'Chained Imputation'] + 'Multivariate Imputation'] colors = ['r', 'g', 'b', 'orange'] # plot diabetes results diff --git a/sklearn/impute.py b/sklearn/impute.py index 89fb33a4f9034..3035040c1179a 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -37,7 +37,7 @@ __all__ = [ 'MissingIndicator', 'SimpleImputer', - 'ChainedImputer', + 'IterativeImputer', ] @@ -149,6 +149,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + Examples -------- >>> import numpy as np @@ -420,14 +424,13 @@ def transform(self, X): return X -class ChainedImputer(BaseEstimator, TransformerMixin): - """Chained imputer transformer to impute missing values. +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. - Basic implementation of chained imputer from MICE (Multivariate - Imputations by Chained Equations) package from R. This version assumes all - of the features are Gaussian. + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -449,24 +452,34 @@ class ChainedImputer(BaseEstimator, TransformerMixin): "random" A random order for each round. - n_imputations : int, optional (default=100) - Number of chained imputation rounds to perform, the results of which - will be used in the final average. + n_iter : int, optional (default=10) + Number of imputation rounds to perform before returning the imputations + computed during the final round. A round is a single imputation of each + feature with missing values. 
- n_burn_in : int, optional (default=10) - Number of initial imputation rounds to perform the results of which - will not be returned. - - predictor : estimator object, default=BayesianRidge() + predictor : estimator object, default=RidgeCV() or BayesianRidge() The predictor to use at each step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. Also, if + ``sample_posterior=True`` the default predictor will be + :class:`sklearn.linear_model.BayesianRidge` and + :class:`sklearn.linear_model.RidgeCV` otherwise. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted predictor for each imputation. Predictor must support + ``return_std`` in its ``predict`` method if set to ``True``. Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of - the each feature column. Nearness between features is measured using + each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after - initial imputation). Can provide significant speed-up when the number - of features is huge. If ``None``, all features will be used. + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the @@ -487,37 +500,43 @@ class ChainedImputer(BaseEstimator, TransformerMixin): or 2. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by ``np.random``. + The seed of the pseudo random number generator to use. Randomizes + selection of predictor features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary `. Attributes ---------- - initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`' - The imputer used to initialize the missing values. + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. imputation_sequence_ : list of tuples Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the current feature, and ``predictor`` is the trained predictor used for - the imputation. + the imputation. Length is ``self.n_features_with_missing_ * n_iter``. + + n_features_with_missing_ : int + Number of features with missing values. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. 
Notes ----- - The R version of MICE does not have inductive functionality, i.e. first - fitting on ``X_train`` and then transforming any ``X_test`` without - additional fitting. We do this by storing each feature's predictor during - the round-robin ``fit`` phase, and predicting without refitting (in order) - during the ``transform`` phase. + To support imputation in inductive mode we store each feature's predictor + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``. - Features with missing values in transform which did not have any missing - values in fit will be imputed with the initial imputation method only. + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. References ---------- @@ -525,14 +544,19 @@ class ChainedImputer(BaseEstimator, TransformerMixin): Multivariate Imputation by Chained Equations in R". Journal of Statistical Software 45: 1-67. `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + `_ """ def __init__(self, missing_values=np.nan, imputation_order='ascending', - n_imputations=100, - n_burn_in=10, + n_iter=10, predictor=None, + sample_posterior=False, n_nearest_features=None, initial_strategy="mean", min_value=None, @@ -542,9 +566,9 @@ def __init__(self, self.missing_values = missing_values self.imputation_order = imputation_order - self.n_imputations = n_imputations - self.n_burn_in = n_burn_in + self.n_iter = n_iter self.predictor = predictor + self.sample_posterior = sample_posterior self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy self.min_value = min_value @@ -582,7 +606,8 @@ def _impute_one_feature(self, predictor : object The predictor to use at this step of the round-robin imputation. - It must support ``return_std`` in its ``predict`` method. + If ``sample_posterior`` is True, the predictor must support + ``return_std`` in its ``predict`` method. If None, it will be cloned from self._predictor. fit_mode : boolean, default=True @@ -621,12 +646,15 @@ def _impute_one_feature(self, # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - imputed_values[good_sigmas] = self.random_state_.normal( - loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + if self.sample_posterior: + mus, sigmas = predictor.predict(X_test, return_std=True) + good_sigmas = sigmas > 0 + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + imputed_values[~good_sigmas] = mus[~good_sigmas] + imputed_values[good_sigmas] = self.random_state_.normal( + loc=mus[good_sigmas], scale=sigmas[good_sigmas]) + else: + imputed_values = predictor.predict(X_test) # clip the values imputed_values = np.clip(imputed_values, @@ -822,44 +850,51 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) + if self.n_iter < 0: + raise ValueError( + "'n_iter' should be a positive integer. Got {} instead." 
+ .format(self.n_iter)) + if self.predictor is None: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() + if self.sample_posterior: + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() + else: + from .linear_model import RidgeCV + # including a very small alpha to approximate OLS + self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) else: self._predictor = clone(self.predictor) + if hasattr(self._predictor, 'random_state'): + self._predictor.random_state = self.random_state_ + self._min_value = np.nan if self.min_value is None else self.min_value self._max_value = np.nan if self.max_value is None else self.max_value self.initial_imputer_ = None - X, X_filled, mask_missing_values = self._initial_imputation(X) - - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt # order in which to impute # note this is probably too slow for large feature data (d > 100000) # and a better way would be good. # see: https://goo.gl/KyCNwj and subsequent comments ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) - abs_corr_mat = self._get_abs_corr_mat(X_filled) + abs_corr_mat = self._get_abs_corr_mat(Xt) # impute data - n_rounds = self.n_burn_in + self.n_imputations - n_samples, n_features = X_filled.shape - Xt = np.zeros((n_samples, n_features), dtype=X.dtype) + n_samples, n_features = Xt.shape self.imputation_sequence_ = [] if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(n_rounds): + for i_rnd in range(self.n_iter): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -867,22 +902,19 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - X_filled, predictor = self._impute_one_feature( - X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, + Xt, predictor = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, predictor=None, fit_mode=True) predictor_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, predictor) self.imputation_sequence_.append(predictor_triplet) - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 0: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -904,28 +936,20 @@ def transform(self, X): """ check_is_fitted(self, 'initial_imputer_') - X, X_filled, mask_missing_values = self._initial_imputation(X) - - # edge case: in case the user specifies 0 for n_imputations, - # then there is no need to do burn in and the result should be - # just the initial imputation (before clipping) - if self.n_imputations < 1: - return X_filled + X, Xt, mask_missing_values = self._initial_imputation(X) - X_filled = np.clip(X_filled, self._min_value, self._max_value) + if self.n_iter == 0: + return Xt - 
n_rounds = self.n_burn_in + self.n_imputations - n_imputations = len(self.imputation_sequence_) - imputations_per_round = n_imputations // n_rounds + imputations_per_round = len(self.imputation_sequence_) // self.n_iter i_rnd = 0 - Xt = np.zeros(X.shape, dtype=X.dtype) if self.verbose > 0: - print("[ChainedImputer] Completing matrix with shape %s" + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() for it, predictor_triplet in enumerate(self.imputation_sequence_): - X_filled, _ = self._impute_one_feature( - X_filled, + Xt, _ = self._impute_one_feature( + Xt, mask_missing_values, predictor_triplet.feat_idx, predictor_triplet.neighbor_feat_idx, @@ -933,15 +957,12 @@ def transform(self, X): fit_mode=False ) if not (it + 1) % imputations_per_round: - if i_rnd >= self.n_burn_in: - Xt += X_filled if self.verbose > 1: - print('[ChainedImputer] Ending imputation round ' + print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, n_rounds, time() - start_t)) + % (i_rnd + 1, self.n_iter, time() - start_t)) i_rnd += 1 - Xt /= self.n_imputations Xt[~mask_missing_values] = X[~mask_missing_values] return Xt diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f9c3e4902f145..dd246cc3e8c4d 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -14,9 +14,9 @@ from sklearn.utils.testing import assert_false from sklearn.impute import MissingIndicator -from sklearn.impute import SimpleImputer, ChainedImputer +from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor -from sklearn.linear_model import BayesianRidge, ARDRegression +from sklearn.linear_model import BayesianRidge, ARDRegression, RidgeCV from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree @@ -73,8 +73,8 @@ def test_imputation_shape(): X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) - chained_imputer = ChainedImputer(initial_strategy=strategy) - X_imputed = chained_imputer.fit_transform(X) + iterative_imputer = IterativeImputer(initial_strategy=strategy) + X_imputed = iterative_imputer.fit_transform(X) assert X_imputed.shape == (10, 2) @@ -508,46 +508,31 @@ def test_imputation_copy(): # made, even if copy=False. 
-def test_chained_imputer_rank_one(): - rng = np.random.RandomState(0) - d = 100 - A = rng.rand(d, 1) - B = rng.rand(1, d) - X = np.dot(A, B) - nan_mask = rng.rand(d, d) < 0.5 - X_missing = X.copy() - X_missing[nan_mask] = np.nan - - imputer = ChainedImputer(n_imputations=5, - n_burn_in=5, - verbose=True, - random_state=rng) - X_filled = imputer.fit_transform(X_missing) - assert_allclose(X_filled, X, atol=0.001) - - @pytest.mark.parametrize( "imputation_order", ['random', 'roman', 'ascending', 'descending', 'arabic'] ) -def test_chained_imputer_imputation_order(imputation_order): +def test_iterative_imputer_imputation_order(imputation_order): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - X[:, 0] = 1 # this column should not be discarded by ChainedImputer - - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - n_nearest_features=5, - min_value=0, - max_value=1, - verbose=False, - imputation_order=imputation_order, - random_state=rng) + X[:, 0] = 1 # this column should not be discarded by IterativeImputer + + n_iter = 2 + imputer = IterativeImputer(missing_values=0, + n_iter=n_iter, + n_nearest_features=5, + min_value=0, + max_value=1, + verbose=False, + imputation_order=imputation_order, + random_state=rng) imputer.fit_transform(X) ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + + assert len(ordered_idx) // n_iter == imputer.n_features_with_missing_ + if imputation_order == 'roman': assert np.all(ordered_idx[:d-1] == np.arange(1, d)) elif imputation_order == 'arabic': @@ -557,25 +542,24 @@ def test_chained_imputer_imputation_order(imputation_order): ordered_idx_round_2 = ordered_idx[d-1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: - assert len(ordered_idx) == 2 * (d - 1) + assert len(ordered_idx) == n_iter * (d - 1) @pytest.mark.parametrize( "predictor", - [DummyRegressor(), BayesianRidge(), ARDRegression()] + [DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] ) -def test_chained_imputer_predictors(predictor): +def test_iterative_imputer_predictors(predictor): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - predictor=predictor, - random_state=rng) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + predictor=predictor, + random_state=rng) imputer.fit_transform(X) # check that types are correct for predictors @@ -588,19 +572,18 @@ def test_chained_imputer_predictors(predictor): assert len(set(hashes)) == len(hashes) -def test_chained_imputer_clip(): +def test_iterative_imputer_clip(): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - min_value=0.1, - max_value=0.2, - random_state=rng) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + min_value=0.1, + max_value=0.2, + random_state=rng) Xt = imputer.fit_transform(X) assert_allclose(np.min(Xt[X == 0]), 0.1) @@ -612,7 +595,7 @@ def test_chained_imputer_clip(): "strategy", ["mean", "median", "most_frequent"] ) -def test_chained_imputer_missing_at_transform(strategy): +def test_iterative_imputer_missing_at_transform(strategy): rng = np.random.RandomState(0) n = 100 d = 10 @@ -622,11 +605,10 @@ def test_chained_imputer_missing_at_transform(strategy): 
X_train[:, 0] = 1 # definitely no missing values in 0th column X_test[0, 0] = 0 # definitely missing value in 0th column - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - initial_strategy=strategy, - random_state=rng).fit(X_train) + imputer = IterativeImputer(missing_values=0, + n_iter=1, + initial_strategy=strategy, + random_state=rng).fit(X_train) initial_imputer = SimpleImputer(missing_values=0, strategy=strategy).fit(X_train) @@ -636,17 +618,19 @@ def test_chained_imputer_missing_at_transform(strategy): initial_imputer.transform(X_test)[:, 0]) -def test_chained_imputer_transform_stochasticity(): - rng = np.random.RandomState(0) +def test_iterative_imputer_transform_stochasticity(): + rng1 = np.random.RandomState(0) + rng2 = np.random.RandomState(1) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, - random_state=rng).toarray() + random_state=rng1).toarray() - imputer = ChainedImputer(missing_values=0, - n_imputations=1, - n_burn_in=1, - random_state=rng) + # when sample_posterior=True, two transforms shouldn't be equal + imputer = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=True, + random_state=rng1) imputer.fit(X) X_fitted_1 = imputer.transform(X) @@ -655,13 +639,39 @@ def test_chained_imputer_transform_stochasticity(): # sufficient to assert that the means are not the same assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) - -def test_chained_imputer_no_missing(): + # when sample_posterior=False, and n_nearest_features=None + # and imputation_order is not random + # the two transforms should be identical even if rng are different + imputer1 = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng1) + + imputer2 = IterativeImputer(missing_values=0, + n_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng2) + imputer1.fit(X) + imputer2.fit(X) + + X_fitted_1a = imputer1.transform(X) + X_fitted_1b = imputer1.transform(X) + X_fitted_2 = imputer2.transform(X) + + assert np.all(X_fitted_1a == X_fitted_1b) + assert np.all(X_fitted_1a == X_fitted_2) + + +def test_iterative_imputer_no_missing(): rng = np.random.RandomState(0) X = rng.rand(100, 100) X[:, 0] = np.nan - m1 = ChainedImputer(n_imputations=10, random_state=rng) - m2 = ChainedImputer(n_imputations=10, random_state=rng) + m1 = IterativeImputer(n_iter=10, random_state=rng) + m2 = IterativeImputer(n_iter=10, random_state=rng) pred1 = m1.fit(X).transform(X) pred2 = m2.fit_transform(X) # should exclude the first column entirely @@ -670,11 +680,28 @@ def test_chained_imputer_no_missing(): assert_allclose(pred1, pred2) +def test_iterative_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 100 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(n_iter=5, + verbose=1, + random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.01) + + @pytest.mark.parametrize( "rank", [3, 5] ) -def test_chained_imputer_transform_recovery(rank): +def test_iterative_imputer_transform_recovery(rank): rng = np.random.RandomState(0) n = 100 d = 100 @@ -692,15 +719,14 @@ def test_chained_imputer_transform_recovery(rank): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = ChainedImputer(n_imputations=10, - n_burn_in=10, - verbose=True, - 
random_state=rng).fit(X_train) + imputer = IterativeImputer(n_iter=10, + verbose=1, + random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) - assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1) + assert_allclose(X_test_filled, X_test_est, atol=0.1) -def test_chained_imputer_additive_matrix(): +def test_iterative_imputer_additive_matrix(): rng = np.random.RandomState(0) n = 100 d = 10 @@ -721,14 +747,20 @@ def test_chained_imputer_additive_matrix(): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = ChainedImputer(n_imputations=25, - n_burn_in=10, - verbose=True, - random_state=rng).fit(X_train) + imputer = IterativeImputer(n_iter=10, + verbose=2, + random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) assert_allclose(X_test_filled, X_test_est, atol=0.01) +def test_iterative_imputer_error_param(): + rng = np.random.RandomState(42) + X = rng.randn(100, 2) + imputer = IterativeImputer(n_iter=-1) + with pytest.raises(ValueError, match='should be a positive integer'): + imputer.fit_transform(X) + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), @@ -845,7 +877,7 @@ def test_missing_indicator_sparse_param(arr_type, missing_values, @pytest.mark.parametrize("imputer_constructor", - [SimpleImputer, ChainedImputer]) + [SimpleImputer, IterativeImputer]) @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 931e50d920402..a59c1b8cd6e6b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -77,7 +77,8 @@ 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer', 'MissingIndicator', +ALLOW_NAN = ['Imputer', 'SimpleImputer', 'IterativeImputer', + 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', 'PowerTransformer', 'QuantileTransformer'] From 09a9a21ba1be7b1ef0722213a475bbabad47bf9f Mon Sep 17 00:00:00 2001 From: Ben Lawson Date: Fri, 5 Oct 2018 16:01:34 -0400 Subject: [PATCH 04/20] [MRG] sample from a truncated normal instead of clipping samples from a normal (#12177) --- doc/whats_new/v0.21.rst | 4 ++++ sklearn/impute.py | 22 +++++++++++++--------- sklearn/tests/test_impute.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 2159e39dc126d..0cc7be8e7a6aa 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -48,6 +48,10 @@ Support for Python 3.4 and below has been officially dropped. function of other features in a round-robin fashion. :issue:`8478` by :user:`Sergey Feldman `. +- |Enhancement| :class:`impute.IterativeImputer` now samples from a truncated normal + distribution instead of a clipped normal distribution when ``sample_posterior=True``. + :issue:`12177` by :user:`Ben Lawson `. + Multiple modules ................ 
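
Clipping draws from a normal distribution piles probability mass onto the
bounds, while sampling from a truncated normal keeps a smooth density inside
them. A minimal illustration of the difference (not part of the patch; it
assumes only ``scipy.stats.truncnorm``, the same API the diff below uses)::

    import numpy as np
    from scipy import stats

    rng = np.random.RandomState(0)
    mu, sigma, lo, hi = 0.4, 0.3, 0.0, 0.5

    # clipping a normal sample creates atoms at the boundaries
    clipped = np.clip(rng.normal(loc=mu, scale=sigma, size=100000), lo, hi)

    # truncnorm takes bounds in units of standard deviations from loc
    a, b = (lo - mu) / sigma, (hi - mu) / sigma
    truncated = stats.truncnorm(a=a, b=b, loc=mu, scale=sigma).rvs(
        size=100000, random_state=rng)

    print((clipped == hi).mean())    # noticeable mass exactly at the bound
    print((truncated == hi).mean())  # essentially zero
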
diff --git a/sklearn/impute.py b/sklearn/impute.py
index 3035040c1179a..e6b0181251071 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -651,15 +651,19 @@ def _impute_one_feature(self,
             good_sigmas = sigmas > 0
             imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
             imputed_values[~good_sigmas] = mus[~good_sigmas]
-            imputed_values[good_sigmas] = self.random_state_.normal(
-                loc=mus[good_sigmas], scale=sigmas[good_sigmas])
+            a = (self._min_value - mus) / sigmas
+            b = (self._max_value - mus) / sigmas
+            truncated_normal = stats.truncnorm(a=a,
+                                               b=b,
+                                               loc=mus[good_sigmas],
+                                               scale=sigmas[good_sigmas])
+            imputed_values[good_sigmas] = truncated_normal.rvs(
+                random_state=self.random_state_)
         else:
             imputed_values = predictor.predict(X_test)
-
-        # clip the values
-        imputed_values = np.clip(imputed_values,
-                                 self._min_value,
-                                 self._max_value)
+            imputed_values = np.clip(imputed_values,
+                                     self._min_value,
+                                     self._max_value)

         # update the feature
         X_filled[missing_row_mask, feat_idx] = imputed_values
@@ -869,8 +873,8 @@ def fit_transform(self, X, y=None):
         if hasattr(self._predictor, 'random_state'):
             self._predictor.random_state = self.random_state_

-        self._min_value = np.nan if self.min_value is None else self.min_value
-        self._max_value = np.nan if self.max_value is None else self.max_value
+        self._min_value = -np.inf if self.min_value is None else self.min_value
+        self._max_value = np.inf if self.max_value is None else self.max_value

         self.initial_imputer_ = None
         X, Xt, mask_missing_values = self._initial_imputation(X)
diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index dd246cc3e8c4d..029864164a69f 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -4,6 +4,7 @@

 import numpy as np
 from scipy import sparse
+from scipy.stats import kstest

 import io

@@ -591,6 +592,39 @@ def test_iterative_imputer_clip():
     assert_allclose(Xt[X != 0], X[X != 0])


+def test_iterative_imputer_truncated_normal_posterior():
+    # test that the values that are imputed using `sample_posterior=True`
+    # with boundaries (`min_value` and `max_value` are not None) are drawn
+    # from a distribution that looks gaussian via the Kolmogorov Smirnov test
+    pytest.importorskip("scipy", minversion="0.17.0")
+    rng = np.random.RandomState(0)
+
+    X = rng.normal(size=(5, 5))
+    X[0][0] = np.nan
+
+    imputer = IterativeImputer(min_value=0,
+                               max_value=0.5,
+                               sample_posterior=True,
+                               random_state=rng)
+
+    imputer.fit_transform(X)
+    # generate multiple imputations for the single missing value
+    imputations = np.array([imputer.transform(X)[0][0] for _ in range(1000)])
+
+    assert all(imputations >= 0)
+    assert all(imputations <= 0.5)
+
+    mu, sigma = imputations.mean(), imputations.std()
+    # guard against a zero standard deviation before normalizing
+    if sigma == 0:
+        sigma += 1e-12
+    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
+    # we want to fail to reject null hypothesis
+    # null hypothesis: distributions are the same
+    assert ks_statistic < 0.2 or p_value > 0.1, \
+        "The posterior does not appear to be normal"
+
+
 @pytest.mark.parametrize(
     "strategy",
     ["mean", "median", "most_frequent"]
@@ -619,6 +653,7 @@ def test_iterative_imputer_missing_at_transform(strategy):


 def test_iterative_imputer_transform_stochasticity():
+    pytest.importorskip("scipy", minversion="0.17.0")
     rng1 = np.random.RandomState(0)
     rng2 = np.random.RandomState(1)
     n = 100
@@ -761,6 +796,7 @@ def test_iterative_imputer_error_param():
     with pytest.raises(ValueError, match='should be a positive
integer'): imputer.fit_transform(X) + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), From caa089e79451a14a332f997a03ed7fec9602eaa1 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 8 Oct 2018 09:01:06 +1100 Subject: [PATCH 05/20] DOC Merge IterativeImputer what's news --- doc/whats_new/v0.21.rst | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 5344dccdbce28..cb03c7eba027b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -43,14 +43,11 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.impute` ..................... -- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy for - imputing missing values by modeling each feature with missing values as a - function of other features in a round-robin fashion. :issue:`8478` by - :user:`Sergey Feldman `. - -- |Enhancement| :class:`impute.IterativeImputer` now samples from a truncated normal - distribution instead of a clipped normal distribution when ``sample_posterior=True``. - :issue:`12177` by :user:`Ben Lawson `. +- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy + for imputing missing values by modeling each feature with missing values as a + function of other features in a round-robin fashion. :issue:`8478` and + :issue:`12177` by :user:`Sergey Feldman ` :user:`Ben Lawson + `. :mod:`sklearn.cluster` ...................... From f103c6be7a0848303a2a8cbb8bbc74d37ace3b15 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 09:04:09 +1100 Subject: [PATCH 06/20] Undo changes to v0.20.rst --- doc/whats_new/v0.20.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index e71e6843106a9..acd54575fd9bd 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -54,6 +54,7 @@ random sampling procedures. Changelog --------- + :mod:`sklearn.compose` ...................... From 9e106580a7147a0a65ce142f84700a278ebcad9a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 09:07:34 +1100 Subject: [PATCH 07/20] Revert changes to v0.20.rst --- doc/whats_new/v0.20.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index acd54575fd9bd..b4e2d65c0ad87 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -54,7 +54,6 @@ random sampling procedures. Changelog --------- - :mod:`sklearn.compose` ...................... @@ -495,6 +494,7 @@ Support for Python 3.3 has been officially dropped. :class:`cluster.AgglomerativeClustering`. :issue:`9875` by :user:`Kumar Ashutosh `. + :mod:`sklearn.compose` ...................... From 0aab6dc73f5284057233bfd91f1c39f80db1ddd9 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 10:56:38 +1100 Subject: [PATCH 08/20] DOC Normalize whitespace in doctest --- doc/modules/impute.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index a8164f27efbea..62cbe74c4ea97 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -106,11 +106,11 @@ round are returned. 
>>> import numpy as np >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(n_iter=10, random_state=0) - >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) + >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE IterativeImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values=nan, n_iter=10, - n_nearest_features=None, predictor=None, random_state=0, - sample_posterior=False, verbose=False) + max_value=None, min_value=None, missing_values=nan, n_iter=10, + n_nearest_features=None, predictor=None, random_state=0, + sample_posterior=False, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] From d34f2270b73b8c8238789e1084d73f84f3bcc7a6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 13:01:54 +1100 Subject: [PATCH 09/20] Fix for SciPy 0.17 --- sklearn/impute.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 411bc78acec8d..f888d7fe83d4f 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -8,9 +8,11 @@ import warnings import numbers from time import time +from distutils.version import LooseVersion import numpy as np import numpy.ma as ma +import scipy from scipy import sparse from scipy import stats from collections import namedtuple @@ -647,14 +649,24 @@ def _impute_one_feature(self, good_sigmas = sigmas > 0 imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) imputed_values[~good_sigmas] = mus[~good_sigmas] + mus = mus[good_sigmas] + sigmas = sigmas[good_sigmas] a = (self._min_value - mus) / sigmas b = (self._max_value - mus) / sigmas - truncated_normal = stats.truncnorm(a=a, - b=b, - loc=mus[good_sigmas], - scale=sigmas[good_sigmas]) - imputed_values[good_sigmas] = truncated_normal.rvs( - random_state=self.random_state_) + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[good_sigmas] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[good_sigmas] = truncated_normal.rvs( + random_state=self.random_state_) else: imputed_values = predictor.predict(X_test) imputed_values = np.clip(imputed_values, From b44dff8a086f37a960c8219ccd4694d1c87632eb Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 17 Jan 2019 18:05:05 +1100 Subject: [PATCH 10/20] Fix doctest --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 62cbe74c4ea97..3b029c4d15751 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -115,7 +115,7 @@ round are returned. >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] [ 6. 3.] - [24. 6.]] + [26. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. 
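
As the documentation above notes, either imputer can sit at the head of a
Pipeline. A minimal sketch of that composition (illustrative only; it assumes
the ``IterativeImputer`` added by this series is importable from
``sklearn.impute``)::

    import numpy as np
    from sklearn.impute import IterativeImputer
    from sklearn.linear_model import BayesianRidge
    from sklearn.pipeline import make_pipeline

    rng = np.random.RandomState(0)
    X = rng.rand(50, 3)
    y = X @ np.array([1., 2., 3.])   # target defined before punching holes
    X[rng.rand(50, 3) < 0.2] = np.nan

    # the imputer fills the NaNs before the downstream regressor sees them
    model = make_pipeline(IterativeImputer(random_state=0), BayesianRidge())
    model.fit(X, y)
    print(model.predict(X[:3]))
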
From 0453c19ede55a69f8c706c6e251cc2cd7fdb75a2 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 13:46:01 +1100 Subject: [PATCH 11/20] Create examples/impute gallery --- build_tools/circle/build_doc.sh | 2 +- doc/modules/impute.rst | 2 +- examples/{ => impute}/plot_missing_values.py | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename examples/{ => impute}/plot_missing_values.py (100%) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index d32f7b9000b95..363148817c61c 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -143,7 +143,7 @@ cd - set +o pipefail affected_doc_paths() { - files=$(git diff --name-only origin/master...$CIRCLE_SHA1) + files=$(git diff --name-only origin/"$CIRCLE_BRANCH"...$CIRCLE_SHA1) echo "$files" | grep ^doc/.*\.rst | sed 's/^doc\/\(.*\)\.rst$/\1.html/' echo "$files" | grep ^examples/.*.py | sed 's/^\(.*\)\.py$/auto_\1.html/' sklearn_files=$(echo "$files" | grep '^sklearn/') diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 3b029c4d15751..1d1f6e926e8f8 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -119,7 +119,7 @@ round are returned. Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. -See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. +See :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. .. _multiple_imputation: diff --git a/examples/plot_missing_values.py b/examples/impute/plot_missing_values.py similarity index 100% rename from examples/plot_missing_values.py rename to examples/impute/plot_missing_values.py From 87585614aac4c7ab1a1e45fefc093eb805a2653e Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 15:58:13 +1100 Subject: [PATCH 12/20] Add missing readme file --- examples/impute/README.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 examples/impute/README.txt diff --git a/examples/impute/README.txt b/examples/impute/README.txt new file mode 100644 index 0000000000000..e42264caf9087 --- /dev/null +++ b/examples/impute/README.txt @@ -0,0 +1,6 @@ +.. _impute_examples: + +Missing Value Imputation +------------------------ + +Examples concerning the :mod:`sklearn.impute` module. 
From f4d970ec9ef287d945e2cf16317b08b897738172 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 22 Jan 2019 15:58:46 +1100 Subject: [PATCH 13/20] Undo change to circle build --- build_tools/circle/build_doc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 363148817c61c..d32f7b9000b95 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -143,7 +143,7 @@ cd - set +o pipefail affected_doc_paths() { - files=$(git diff --name-only origin/"$CIRCLE_BRANCH"...$CIRCLE_SHA1) + files=$(git diff --name-only origin/master...$CIRCLE_SHA1) echo "$files" | grep ^doc/.*\.rst | sed 's/^doc\/\(.*\)\.rst$/\1.html/' echo "$files" | grep ^examples/.*.py | sed 's/^\(.*\)\.py$/auto_\1.html/' sklearn_files=$(echo "$files" | grep '^sklearn/') From 34b7a4648358c2d7a10095c439d6bde7983d6d83 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 24 Jan 2019 09:56:45 +1100 Subject: [PATCH 14/20] DOC Make IterativeImputer doctest more stable (#13026) --- doc/modules/impute.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 1d1f6e926e8f8..45523d74fe9b8 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -106,16 +106,17 @@ round are returned. >>> import numpy as np >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(n_iter=10, random_state=0) - >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE + >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE IterativeImputer(imputation_order='ascending', initial_strategy='mean', max_value=None, min_value=None, missing_values=nan, n_iter=10, n_nearest_features=None, predictor=None, random_state=0, sample_posterior=False, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + >>> # the model learns that the second feature is double the first >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] - [ 6. 3.] - [26. 6.]] + [ 6. 12.] + [ 3. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. 
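
The round-robin state that the next patch's test inspects is stored on the
fitted imputer as ``imputation_sequence_``. A hypothetical sketch of poking at
it (``predictor`` is the parameter name at this point in the series; it is
renamed later)::

    import numpy as np
    from sklearn.impute import IterativeImputer
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(20, 4)
    X[rng.rand(20, 4) < 0.25] = np.nan

    imp = IterativeImputer(predictor=DecisionTreeRegressor(random_state=0),
                           random_state=0)
    imp.fit(X)
    # one (feat_idx, neighbor_feat_idx, predictor) triplet is stored per
    # imputed feature and per imputation round
    print({type(t.predictor).__name__ for t in imp.imputation_sequence_})
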
From b58bd0b1ef7c6fd4bd5b4947bdbf2a91b75bb9d1 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 24 Jan 2019 12:08:10 +1100 Subject: [PATCH 15/20] TST IterativeImputer: Check predictor type (#13039) --- sklearn/tests/test_impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index fd2bbd4ec5ad0..3f347edd00e3e 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -565,7 +565,7 @@ def test_iterative_imputer_predictors(predictor): # check that types are correct for predictors hashes = [] for triplet in imputer.imputation_sequence_: - assert triplet.predictor + assert isinstance(triplet.predictor, type(predictor)) hashes.append(id(triplet.predictor)) # check that each predictor is unique From cf4670c23ae00725e6efa8c0283311ccb631e28e Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Thu, 24 Jan 2019 08:59:07 -0800 Subject: [PATCH 16/20] EHN: Changing default model for IterativeImputer to BayesianRidge (#13038) --- sklearn/impute.py | 16 ++++------------ sklearn/tests/test_impute.py | 6 ++++-- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index f888d7fe83d4f..6dfce49f7b1f2 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -455,13 +455,10 @@ class IterativeImputer(BaseEstimator, TransformerMixin): computed during the final round. A round is a single imputation of each feature with missing values. - predictor : estimator object, default=RidgeCV() or BayesianRidge() + predictor : estimator object, default=BayesianRidge() The predictor to use at each step of the round-robin imputation. If ``sample_posterior`` is True, the predictor must support - ``return_std`` in its ``predict`` method. Also, if - ``sample_posterior=True`` the default predictor will be - :class:`sklearn.linear_model.BayesianRidge` and - :class:`sklearn.linear_model.RidgeCV` otherwise. + ``return_std`` in its ``predict`` method. 
sample_posterior : boolean, default=False Whether to sample from the (Gaussian) predictive posterior of the @@ -868,13 +865,8 @@ def fit_transform(self, X, y=None): .format(self.n_iter)) if self.predictor is None: - if self.sample_posterior: - from .linear_model import BayesianRidge - self._predictor = BayesianRidge() - else: - from .linear_model import RidgeCV - # including a very small alpha to approximate OLS - self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10])) + from .linear_model import BayesianRidge + self._predictor = BayesianRidge() else: self._predictor = clone(self.predictor) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 3f347edd00e3e..a2bf8d75ef9e5 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -547,7 +547,7 @@ def test_iterative_imputer_imputation_order(imputation_order): @pytest.mark.parametrize( "predictor", - [DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] + [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] ) def test_iterative_imputer_predictors(predictor): rng = np.random.RandomState(0) @@ -565,7 +565,9 @@ def test_iterative_imputer_predictors(predictor): # check that types are correct for predictors hashes = [] for triplet in imputer.imputation_sequence_: - assert isinstance(triplet.predictor, type(predictor)) + expected_type = (type(predictor) if predictor is not None + else type(BayesianRidge())) + assert isinstance(triplet.predictor, expected_type) hashes.append(id(triplet.predictor)) # check that each predictor is unique From dc304a4e16ff782eec52ee036c88d91437c740a2 Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Thu, 24 Jan 2019 18:50:52 -0800 Subject: [PATCH 17/20] EXA Add IterativeImputer extended example (#12100) --- doc/modules/impute.rst | 70 ++++++---- ...t_iterative_imputer_variants_comparison.py | 126 ++++++++++++++++++ examples/impute/plot_missing_values.py | 58 ++++---- sklearn/impute.py | 2 +- 4 files changed, 200 insertions(+), 56 deletions(-) create mode 100644 examples/impute/plot_iterative_imputer_variants_comparison.py diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 45523d74fe9b8..1db20e9c6dcdb 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -9,19 +9,19 @@ Imputation of missing values For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with scikit-learn estimators which assume that all values in an -array are numerical, and that all have and hold meaning. A basic strategy to use -incomplete datasets is to discard entire rows and/or columns containing missing -values. However, this comes at the price of losing data which may be valuable -(even though incomplete). A better strategy is to impute the missing values, -i.e., to infer them from the known part of the data. See the :ref:`glossary` -entry on imputation. +array are numerical, and that all have and hold meaning. A basic strategy to +use incomplete datasets is to discard entire rows and/or columns containing +missing values. However, this comes at the price of losing data which may be +valuable (even though incomplete). A better strategy is to impute the missing +values, i.e., to infer them from the known part of the data. See the +:ref:`glossary` entry on imputation. Univariate vs. 
Multivariate Imputation ====================================== -One type of imputation algorithm is univariate, which imputes values in the i-th -feature dimension using only non-missing values in that feature dimension +One type of imputation algorithm is univariate, which imputes values in the +i-th feature dimension using only non-missing values in that feature dimension (e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation algorithms use the entire set of available feature dimensions to estimate the missing values (e.g. :class:`impute.IterativeImputer`). @@ -66,9 +66,9 @@ The :class:`SimpleImputer` class also supports sparse matrices:: [6. 3.] [7. 6.]] -Note that this format is not meant to be used to implicitly store missing values -in the matrix because it would densify it at transform time. Missing values encoded -by 0 must be used with dense input. +Note that this format is not meant to be used to implicitly store missing +values in the matrix because it would densify it at transform time. Missing +values encoded by 0 must be used with dense input. The :class:`SimpleImputer` class also supports categorical data represented as string values or pandas categoricals when using the ``'most_frequent'`` or @@ -110,7 +110,7 @@ round are returned. IterativeImputer(imputation_order='ascending', initial_strategy='mean', max_value=None, min_value=None, missing_values=nan, n_iter=10, n_nearest_features=None, predictor=None, random_state=0, - sample_posterior=False, verbose=False) + sample_posterior=False, verbose=0) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> # the model learns that the second feature is double the first >>> print(np.round(imp.transform(X_test))) @@ -118,23 +118,35 @@ round are returned. [ 6. 12.] [ 3. 6.]] -Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline -as a way to build a composite estimator that supports imputation. +Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a +Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. +Flexibility of IterativeImputer +------------------------------- + +There are many well-established imputation packages in the R data science +ecosystem: Amelia, mi, mice, missForest, etc. missForest is popular, and turns +out to be a particular instance of different sequential imputation algorithms +that can all be implemented with :class:`IterativeImputer` by passing in +different regressors to be used for predicting missing feature values. In the +case of missForest, this regressor is a Random Forest. +See :ref:`sphx_glr_auto_examples_plot_iterative_imputer_variants_comparison.py`. + + .. _multiple_imputation: Multiple vs. Single Imputation -============================== +------------------------------ -In the statistics community, it is common practice to perform multiple imputations, -generating, for example, ``m`` separate imputations for a single feature matrix. -Each of these ``m`` imputations is then put through the subsequent analysis pipeline -(e.g. feature engineering, clustering, regression, classification). The ``m`` final -analysis results (e.g. held-out validation errors) allow the data scientist -to obtain understanding of how analytic results may differ as a consequence -of the inherent uncertainty caused by the missing values. The above practice -is called multiple imputation. 
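
An illustrative sketch of that practice with the API from this series
(``m = 5`` completed datasets via repeated fits; it assumes
``sample_posterior=True`` draws imputations from the predictive posterior as
described)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 4))
    X[rng.rand(100, 4) < 0.2] = np.nan

    # five stochastic completions of the same feature matrix
    imputations = [
        IterativeImputer(sample_posterior=True,
                         random_state=seed).fit_transform(X)
        for seed in range(5)
    ]
    # the spread across completions reflects uncertainty about the
    # missing entries
    print(np.std([Xc.mean() for Xc in imputations]))
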
+In the statistics community, it is common practice to perform multiple +imputations, generating, for example, ``m`` separate imputations for a single +feature matrix. Each of these ``m`` imputations is then put through the +subsequent analysis pipeline (e.g. feature engineering, clustering, regression, +classification). The ``m`` final analysis results (e.g. held-out validation +errors) allow the data scientist to obtain understanding of how analytic +results may differ as a consequence of the inherent uncertainty caused by the +missing values. The above practice is called multiple imputation. Our implementation of :class:`IterativeImputer` was inspired by the R MICE package (Multivariate Imputation by Chained Equations) [1]_, but differs from @@ -144,13 +156,13 @@ it repeatedly to the same dataset with different random seeds when ``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple vs. single imputations. -It is still an open problem as to how useful single vs. multiple imputation is in -the context of prediction and classification when the user is not interested in -measuring uncertainty due to missing values. +It is still an open problem as to how useful single vs. multiple imputation is +in the context of prediction and classification when the user is not +interested in measuring uncertainty due to missing values. -Note that a call to the ``transform`` method of :class:`IterativeImputer` is not -allowed to change the number of samples. Therefore multiple imputations cannot be -achieved by a single call to ``transform``. +Note that a call to the ``transform`` method of :class:`IterativeImputer` is +not allowed to change the number of samples. Therefore multiple imputations +cannot be achieved by a single call to ``transform``. References ========== diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py new file mode 100644 index 0000000000000..a850deb273f24 --- /dev/null +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -0,0 +1,126 @@ +""" +========================================================= +Imputing missing values with variants of IterativeImputer +========================================================= + +The :class:`sklearn.impute.IterativeImputer` class is very flexible - it can be +used with a variety of predictors to do round-robin regression, treating every +variable as an output in turn. + +In this example we compare some predictors for the purpose of missing feature +imputation with :class:`sklearn.imputeIterativeImputer`:: + + :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression + :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression + :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R + :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN + imputation approaches + +Of particular interest is the ability of +:class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a +popular imputation package for R. In this example, we have chosen to use +:class:`sklearn.ensemble.ExtraTreesRegressor` instead of +:class:`sklearn.ensemble.RandomForestRegressor` (as in missForest) due to its +increased speed. + +Note that :class:`sklearn.neighbors.KNeighborsRegressor` is different from KNN +imputation, which learns from samples with missing values by using a distance +metric that accounts for missing values, rather than imputing them. 
+ +The goal is to compare different predictors to see which one is best for the +:class:`sklearn.impute.IterativeImputer` when using a +:class:`sklearn.linear_model.BayesianRidge` estimator on the California housing +dataset with a single value randomly removed from each row. + +For this particular pattern of missing values we see that +:class:`sklearn.ensemble.ExtraTreesRegressor` and +:class:`sklearn.linear_model.BayesianRidge` give the best results. +""" +print(__doc__) + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.datasets import fetch_california_housing +from sklearn.impute import SimpleImputer +from sklearn.impute import IterativeImputer +from sklearn.linear_model import BayesianRidge +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import ExtraTreesRegressor +from sklearn.neighbors import KNeighborsRegressor +from sklearn.pipeline import make_pipeline +from sklearn.model_selection import cross_val_score + +N_SPLITS = 5 + +rng = np.random.RandomState(0) + +X_full, y_full = fetch_california_housing(return_X_y=True) +n_samples, n_features = X_full.shape + +# Estimate the score on the entire dataset, with no missing values +br_estimator = BayesianRidge() +score_full_data = pd.DataFrame( + cross_val_score( + br_estimator, X_full, y_full, scoring='neg_mean_squared_error', + cv=N_SPLITS + ), + columns=['Full Data'] +) + +# Add a single missing value to each row +X_missing = X_full.copy() +y_missing = y_full +missing_samples = np.arange(n_samples) +missing_features = rng.choice(n_features, n_samples, replace=True) +X_missing[missing_samples, missing_features] = np.nan + +# Estimate the score after imputation (mean and median strategies) +score_simple_imputer = pd.DataFrame() +for strategy in ('mean', 'median'): + estimator = make_pipeline( + SimpleImputer(missing_values=np.nan, strategy=strategy), + br_estimator + ) + score_simple_imputer[strategy] = cross_val_score( + estimator, X_missing, y_missing, scoring='neg_mean_squared_error', + cv=N_SPLITS + ) + +# Estimate the score after iterative imputation of the missing values +# with different predictors +predictors = [ + BayesianRidge(), + DecisionTreeRegressor(max_features='sqrt', random_state=0), + ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0), + KNeighborsRegressor(n_neighbors=15) +] +score_iterative_imputer = pd.DataFrame() +for predictor in predictors: + estimator = make_pipeline( + IterativeImputer(random_state=0, predictor=predictor), + br_estimator + ) + score_iterative_imputer[predictor.__class__.__name__] = \ + cross_val_score( + estimator, X_missing, y_missing, scoring='neg_mean_squared_error', + cv=N_SPLITS + ) + +scores = pd.concat( + [score_full_data, score_simple_imputer, score_iterative_imputer], + keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1 +) + +# plot boston results +fig, ax = plt.subplots(figsize=(13, 6)) +means = -scores.mean() +errors = scores.std() +means.plot.barh(xerr=errors, ax=ax) +ax.set_title('California Housing Regression with Different Imputation Methods') +ax.set_xlabel('MSE (smaller is better)') +ax.set_yticks(np.arange(means.shape[0])) +ax.set_yticklabels([" w/ ".join(label) for label in means.index.get_values()]) +plt.tight_layout(pad=1) +plt.show() diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 43d7ddfc497f3..897b66aad246c 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -12,12 +12,13 @@ round-robin 
linear regression, treating every variable as an output in turn. The version implemented assumes Gaussian (output) variables. If your features are obviously non-Normal, consider transforming them to look more -Normal so as to improve performance. +Normal so as to potentially improve performance. In addition of using an imputing method, we can also keep an indication of the missing information using :func:`sklearn.impute.MissingIndicator` which might carry some information. """ +print(__doc__) import numpy as np import matplotlib.pyplot as plt @@ -31,6 +32,19 @@ rng = np.random.RandomState(0) +N_SPLITS = 5 +REGRESSOR = RandomForestRegressor(random_state=0, n_estimators=100) + + +def get_scores_for_imputer(imputer, X_missing, y_missing): + estimator = make_pipeline( + make_union(imputer, MissingIndicator(missing_values=0)), + REGRESSOR) + impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error', + cv=N_SPLITS) + return impute_scores + def get_results(dataset): X_full, y_full = dataset.data, dataset.target @@ -38,9 +52,9 @@ def get_results(dataset): n_features = X_full.shape[1] # Estimate the score on the entire dataset, with no missing values - estimator = RandomForestRegressor(random_state=0, n_estimators=100) - full_scores = cross_val_score(estimator, X_full, y_full, - scoring='neg_mean_squared_error', cv=5) + full_scores = cross_val_score(REGRESSOR, X_full, y_full, + scoring='neg_mean_squared_error', + cv=N_SPLITS) # Add missing values in 75% of the lines missing_rate = 0.75 @@ -51,35 +65,27 @@ def get_results(dataset): dtype=np.bool))) rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) - - # Estimate the score after replacing missing values by 0 X_missing = X_full.copy() X_missing[np.where(missing_samples)[0], missing_features] = 0 y_missing = y_full.copy() - estimator = RandomForestRegressor(random_state=0, n_estimators=100) - zero_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=5) + + # Estimate the score after replacing missing values by 0 + imputer = SimpleImputer(missing_values=0, + strategy='constant', + fill_value=0) + zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) # Estimate the score after imputation (mean strategy) of the missing values - X_missing = X_full.copy() - X_missing[np.where(missing_samples)[0], missing_features] = 0 - y_missing = y_full.copy() - estimator = make_pipeline( - make_union(SimpleImputer(missing_values=0, strategy="mean"), - MissingIndicator(missing_values=0)), - RandomForestRegressor(random_state=0, n_estimators=100)) - mean_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=5) + imputer = SimpleImputer(missing_values=0, strategy="mean") + mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) # Estimate the score after iterative imputation of the missing values - estimator = make_pipeline( - make_union(IterativeImputer(missing_values=0, random_state=0), - MissingIndicator(missing_values=0)), - RandomForestRegressor(random_state=0, n_estimators=100)) - iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error') + imputer = IterativeImputer(missing_values=0, + random_state=0, + n_nearest_features=5) + iterative_impute_scores = get_scores_for_imputer(imputer, + X_missing, + y_missing) return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), 
zero_impute_scores.std()), diff --git a/sklearn/impute.py b/sklearn/impute.py index 6dfce49f7b1f2..ef4e552260e05 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -556,7 +556,7 @@ def __init__(self, initial_strategy="mean", min_value=None, max_value=None, - verbose=False, + verbose=0, random_state=None): self.missing_values = missing_values From 92e731606a43b5594f5c4a2a2f32fe9b8648cc38 Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Tue, 12 Feb 2019 12:54:30 -0800 Subject: [PATCH 18/20] ENH IterativeImputer: n_iter->max_iter (#13061) --- doc/modules/impute.rst | 14 +- ...t_iterative_imputer_variants_comparison.py | 16 +- sklearn/impute.py | 217 +++++++++++------- sklearn/tests/test_impute.py | 183 ++++++++++++--- 4 files changed, 294 insertions(+), 136 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 1db20e9c6dcdb..6de5df8b12729 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -100,17 +100,17 @@ fashion: at each step, a feature column is designated as output ``y`` and the other feature columns are treated as inputs ``X``. A regressor is fit on ``(X, y)`` for known ``y``. Then, the regressor is used to predict the missing values of ``y``. This is done for each feature in an iterative fashion, and then is -repeated for ``n_iter`` imputation rounds. The results of the final imputation -round are returned. +repeated for ``max_iter`` imputation rounds. The results of the final +imputation round are returned. >>> import numpy as np >>> from sklearn.impute import IterativeImputer - >>> imp = IterativeImputer(n_iter=10, random_state=0) + >>> imp = IterativeImputer(max_iter=10, random_state=0) >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE - IterativeImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values=nan, n_iter=10, - n_nearest_features=None, predictor=None, random_state=0, - sample_posterior=False, verbose=0) + IterativeImputer(estimator=None, imputation_order='ascending', + initial_strategy='mean', max_iter=10, max_value=None, + min_value=None, missing_values=nan, n_nearest_features=None, + random_state=0, sample_posterior=False, tol=0.001, verbose=0) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> # the model learns that the second feature is double the first >>> print(np.round(imp.transform(X_test))) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index a850deb273f24..77a12e87a1e8a 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -4,10 +4,10 @@ ========================================================= The :class:`sklearn.impute.IterativeImputer` class is very flexible - it can be -used with a variety of predictors to do round-robin regression, treating every +used with a variety of estimators to do round-robin regression, treating every variable as an output in turn. -In this example we compare some predictors for the purpose of missing feature +In this example we compare some estimators for the purpose of missing feature imputation with :class:`sklearn.imputeIterativeImputer`:: :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression @@ -27,7 +27,7 @@ imputation, which learns from samples with missing values by using a distance metric that accounts for missing values, rather than imputing them. 
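
The missForest-style variant this docstring mentions reduces to swapping a
tree ensemble into the round robin. A minimal sketch under the ``estimator``
parameter name that this patch introduces (illustrative, not taken from the
example itself)::

    import numpy as np
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.impute import IterativeImputer

    rng = np.random.RandomState(0)
    X = rng.rand(100, 5)
    X[rng.rand(100, 5) < 0.3] = np.nan

    imp = IterativeImputer(
        estimator=ExtraTreesRegressor(n_estimators=10, random_state=0),
        max_iter=10, random_state=0)
    X_filled = imp.fit_transform(X)
    print(np.isnan(X_filled).any())  # False: every entry is now filled
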
-The goal is to compare different predictors to see which one is best for the +The goal is to compare different estimators to see which one is best for the :class:`sklearn.impute.IterativeImputer` when using a :class:`sklearn.linear_model.BayesianRidge` estimator on the California housing dataset with a single value randomly removed from each row. @@ -89,20 +89,20 @@ ) # Estimate the score after iterative imputation of the missing values -# with different predictors -predictors = [ +# with different estimators +estimators = [ BayesianRidge(), DecisionTreeRegressor(max_features='sqrt', random_state=0), ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0), KNeighborsRegressor(n_neighbors=15) ] score_iterative_imputer = pd.DataFrame() -for predictor in predictors: +for estimator in estimators: estimator = make_pipeline( - IterativeImputer(random_state=0, predictor=predictor), + IterativeImputer(random_state=0, estimator=estimator), br_estimator ) - score_iterative_imputer[predictor.__class__.__name__] = \ + score_iterative_imputer[estimator.__class__.__name__] = \ cross_val_score( estimator, X_missing, y_missing, scoring='neg_mean_squared_error', cv=N_SPLITS diff --git a/sklearn/impute.py b/sklearn/impute.py index ef4e552260e05..c8ae55ffa318c 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -19,6 +19,7 @@ from .base import BaseEstimator, TransformerMixin from .base import clone +from .exceptions import ConvergenceWarning from .preprocessing import normalize from .utils import check_array, check_random_state, safe_indexing from .utils.sparsefuncs import _get_median @@ -30,7 +31,7 @@ ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', 'neighbor_feat_idx', - 'predictor']) + 'estimator']) __all__ = [ 'MissingIndicator', @@ -436,36 +437,28 @@ class IterativeImputer(BaseEstimator, TransformerMixin): The placeholder for the missing values. All occurrences of ``missing_values`` will be imputed. - imputation_order : str, optional (default="ascending") - The order in which the features will be imputed. Possible values: - - "ascending" - From features with fewest missing values to most. - "descending" - From features with most missing values to fewest. - "roman" - Left to right. - "arabic" - Right to left. - "random" - A random order for each round. - - n_iter : int, optional (default=10) - Number of imputation rounds to perform before returning the imputations - computed during the final round. A round is a single imputation of each - feature with missing values. - - predictor : estimator object, default=BayesianRidge() - The predictor to use at each step of the round-robin imputation. - If ``sample_posterior`` is True, the predictor must support + estimator : estimator object, default=BayesianRidge() + The estimator to use at each step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. sample_posterior : boolean, default=False Whether to sample from the (Gaussian) predictive posterior of the - fitted predictor for each imputation. Predictor must support + fitted estimator for each imputation. Estimator must support ``return_std`` in its ``predict`` method if set to ``True``. Set to ``True`` if using ``IterativeImputer`` for multiple imputations. + max_iter : int, optional (default=10) + Maximum number of imputation rounds to perform before returning the + imputations computed during the final round. A round is a single + imputation of each feature with missing values. 
The stopping criterion + is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, + where `X_t` is `X` at iteration `t. Note that early stopping is only + applied if ``sample_posterior=False``. + + tol : float, optional (default=1e-3) + Tolerance of the stopping condition. + n_nearest_features : int, optional (default=None) Number of other features to use to estimate the missing values of each feature column. Nearness between features is measured using @@ -481,6 +474,20 @@ class IterativeImputer(BaseEstimator, TransformerMixin): ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` Valid values: {"mean", "median", "most_frequent", or "constant"}. + imputation_order : str, optional (default="ascending") + The order in which the features will be imputed. Possible values: + + "ascending" + From features with fewest missing values to most. + "descending" + From features with most missing values to fewest. + "roman" + Left to right. + "arabic" + Right to left. + "random" + A random order for each round. + min_value : float, optional (default=None) Minimum possible imputed value. Default of ``None`` will set minimum to negative infinity. @@ -496,7 +503,7 @@ class IterativeImputer(BaseEstimator, TransformerMixin): random_state : int, RandomState instance or None, optional (default=None) The seed of the pseudo random number generator to use. Randomizes - selection of predictor features if n_nearest_features is not None, the + selection of estimator features if n_nearest_features is not None, the ``imputation_order`` if ``random``, and the sampling from posterior if ``sample_posterior`` is True. Use an integer for determinism. See :term:`the Glossary `. @@ -507,11 +514,16 @@ class IterativeImputer(BaseEstimator, TransformerMixin): Imputer used to initialize the missing values. imputation_sequence_ : list of tuples - Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where + Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the - current feature, and ``predictor`` is the trained predictor used for - the imputation. Length is ``self.n_features_with_missing_ * n_iter``. + current feature, and ``estimator`` is the trained estimator used for + the imputation. Length is ``self.n_features_with_missing_ * + self.n_iter_``. + + n_iter_ : int + Number of iteration rounds that occurred. Will be less than + ``self.max_iter`` if early stopping criterion was reached. n_features_with_missing_ : int Number of features with missing values. @@ -522,7 +534,7 @@ class IterativeImputer(BaseEstimator, TransformerMixin): Notes ----- - To support imputation in inductive mode we store each feature's predictor + To support imputation in inductive mode we store each feature's estimator during the ``fit`` phase, and predict without refitting (in order) during the ``transform`` phase. 
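
A quick sketch of the stopping behaviour documented above; the fitted
``n_iter_`` attribute records the round at which the tolerance check fired
(it assumes this patch's ``max_iter``/``tol`` API)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    rng = np.random.RandomState(0)
    X = rng.rand(200, 5)
    X[rng.rand(200, 5) < 0.2] = np.nan

    imp = IterativeImputer(max_iter=100, tol=1e-3, random_state=0)
    imp.fit(X)
    # with sample_posterior=False the loop exits once successive imputations
    # differ by less than the scaled tolerance, so n_iter_ << max_iter here
    print(imp.n_iter_)
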
@@ -548,24 +560,26 @@ class IterativeImputer(BaseEstimator, TransformerMixin): def __init__(self, missing_values=np.nan, - imputation_order='ascending', - n_iter=10, - predictor=None, + estimator=None, sample_posterior=False, + max_iter=10, + tol=1e-3, n_nearest_features=None, initial_strategy="mean", + imputation_order='ascending', min_value=None, max_value=None, verbose=0, random_state=None): self.missing_values = missing_values - self.imputation_order = imputation_order - self.n_iter = n_iter - self.predictor = predictor + self.estimator = estimator self.sample_posterior = sample_posterior + self.max_iter = max_iter + self.tol = tol self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy + self.imputation_order = imputation_order self.min_value = min_value self.max_value = max_value self.verbose = verbose @@ -576,12 +590,12 @@ def _impute_one_feature(self, mask_missing_values, feat_idx, neighbor_feat_idx, - predictor=None, + estimator=None, fit_mode=True): """Impute a single feature from the others provided. This function predicts the missing values of one of the features using - the current estimates of all the other features. The ``predictor`` must + the current estimates of all the other features. The ``estimator`` must support ``return_std=True`` in its ``predict`` method for this function to work. @@ -599,22 +613,22 @@ def _impute_one_feature(self, neighbor_feat_idx : ndarray Indices of the features to be used in imputing ``feat_idx``. - predictor : object - The predictor to use at this step of the round-robin imputation. - If ``sample_posterior`` is True, the predictor must support + estimator : object + The estimator to use at this step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. - If None, it will be cloned from self._predictor. + If None, it will be cloned from self._estimator. fit_mode : boolean, default=True - Whether to fit and predict with the predictor or just predict. + Whether to fit and predict with the estimator or just predict. Returns ------- X_filled : ndarray Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. - predictor : predictor with sklearn API - The fitted predictor used to impute + estimator : estimator with sklearn API + The fitted estimator used to impute ``X_filled[missing_row_mask, feat_idx]``. 
""" @@ -622,38 +636,46 @@ def _impute_one_feature(self, # (should not happen at fit time because feat_ids would be excluded) missing_row_mask = mask_missing_values[:, feat_idx] if not np.any(missing_row_mask): - return X_filled, predictor + return X_filled, estimator - if predictor is None and fit_mode is False: + if estimator is None and fit_mode is False: raise ValueError("If fit_mode is False, then an already-fitted " - "predictor should be passed in.") + "estimator should be passed in.") - if predictor is None: - predictor = clone(self._predictor) + if estimator is None: + estimator = clone(self._estimator) if fit_mode: X_train = safe_indexing(X_filled[:, neighbor_feat_idx], ~missing_row_mask) y_train = safe_indexing(X_filled[:, feat_idx], ~missing_row_mask) - predictor.fit(X_train, y_train) + estimator.fit(X_train, y_train) # get posterior samples X_test = safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) if self.sample_posterior: - mus, sigmas = predictor.predict(X_test, return_std=True) - good_sigmas = sigmas > 0 + mus, sigmas = estimator.predict(X_test, return_std=True) imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - imputed_values[~good_sigmas] = mus[~good_sigmas] - mus = mus[good_sigmas] - sigmas = sigmas[good_sigmas] + # two types of problems: (1) non-positive sigmas, (2) mus outside + # legal range of min_value and max_value (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value + imputed_values[mus_too_low] = self._min_value + mus_too_high = mus > self._max_value + imputed_values[mus_too_high] = self._max_value + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] a = (self._min_value - mus) / sigmas b = (self._max_value - mus) / sigmas if scipy.__version__ < LooseVersion('0.18'): # bug with vector-valued `a` in old scipy - imputed_values[good_sigmas] = [ + imputed_values[inrange_mask] = [ stats.truncnorm(a=a_, b=b_, loc=loc_, scale=scale_).rvs( random_state=self.random_state_) @@ -662,17 +684,17 @@ def _impute_one_feature(self, else: truncated_normal = stats.truncnorm(a=a, b=b, loc=mus, scale=sigmas) - imputed_values[good_sigmas] = truncated_normal.rvs( + imputed_values[inrange_mask] = truncated_normal.rvs( random_state=self.random_state_) else: - imputed_values = predictor.predict(X_test) + imputed_values = estimator.predict(X_test) imputed_values = np.clip(imputed_values, self._min_value, self._max_value) # update the feature X_filled[missing_row_mask, feat_idx] = imputed_values - return X_filled, predictor + return X_filled, estimator def _get_neighbor_feat_idx(self, n_features, @@ -859,19 +881,27 @@ def fit_transform(self, X, y=None): self.random_state_ = getattr(self, "random_state_", check_random_state(self.random_state)) - if self.n_iter < 0: + if self.max_iter < 0: raise ValueError( - "'n_iter' should be a positive integer. Got {} instead." - .format(self.n_iter)) + "'max_iter' should be a positive integer. Got {} instead." + .format(self.max_iter)) + + if self.tol < 0: + raise ValueError( + "'tol' should be a non-negative float. Got {} instead." 
+ .format(self.tol) + ) - if self.predictor is None: + if self.estimator is None: from .linear_model import BayesianRidge - self._predictor = BayesianRidge() + self._estimator = BayesianRidge() else: - self._predictor = clone(self.predictor) + self._estimator = clone(self.estimator) - if hasattr(self._predictor, 'random_state'): - self._predictor.random_state = self.random_state_ + self.imputation_sequence_ = [] + + if hasattr(self._estimator, 'random_state'): + self._estimator.random_state = self.random_state_ self._min_value = -np.inf if self.min_value is None else self.min_value self._max_value = np.inf if self.max_value is None else self.max_value @@ -879,7 +909,8 @@ def fit_transform(self, X, y=None): self.initial_imputer_ = None X, Xt, mask_missing_values = self._initial_imputation(X) - if self.n_iter == 0: + if self.max_iter == 0 or np.all(mask_missing_values): + self.n_iter_ = 0 return Xt # order in which to impute @@ -891,14 +922,15 @@ def fit_transform(self, X, y=None): abs_corr_mat = self._get_abs_corr_mat(Xt) - # impute data n_samples, n_features = Xt.shape - self.imputation_sequence_ = [] if self.verbose > 0: print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for i_rnd in range(self.n_iter): + if not self.sample_posterior: + Xt_previous = Xt.copy() + normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) + for self.n_iter_ in range(1, self.max_iter + 1): if self.imputation_order == 'random': ordered_idx = self._get_ordered_idx(mask_missing_values) @@ -906,19 +938,32 @@ def fit_transform(self, X, y=None): neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat) - Xt, predictor = self._impute_one_feature( + Xt, estimator = self._impute_one_feature( Xt, mask_missing_values, feat_idx, neighbor_feat_idx, - predictor=None, fit_mode=True) - predictor_triplet = ImputerTriplet(feat_idx, + estimator=None, fit_mode=True) + estimator_triplet = ImputerTriplet(feat_idx, neighbor_feat_idx, - predictor) - self.imputation_sequence_.append(predictor_triplet) + estimator) + self.imputation_sequence_.append(estimator_triplet) - if self.verbose > 0: + if self.verbose > 1: print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter, time() - start_t)) - + % (self.n_iter_, self.max_iter, time() - start_t)) + + if not self.sample_posterior: + inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, + axis=None) + if inf_norm < normalized_tol: + if self.verbose > 0: + print('[IterativeImputer] Early stopping criterion ' + 'reached.') + break + Xt_previous = Xt.copy() + else: + if not self.sample_posterior: + warnings.warn("[IterativeImputer] Early stopping criterion not" + " reached.", ConvergenceWarning) Xt[~mask_missing_values] = X[~mask_missing_values] return Xt @@ -942,29 +987,29 @@ def transform(self, X): X, Xt, mask_missing_values = self._initial_imputation(X) - if self.n_iter == 0: + if self.n_iter_ == 0 or np.all(mask_missing_values): return Xt - imputations_per_round = len(self.imputation_sequence_) // self.n_iter + imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ i_rnd = 0 if self.verbose > 0: print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) start_t = time() - for it, predictor_triplet in enumerate(self.imputation_sequence_): + for it, estimator_triplet in enumerate(self.imputation_sequence_): Xt, _ = self._impute_one_feature( Xt, mask_missing_values, - predictor_triplet.feat_idx, - predictor_triplet.neighbor_feat_idx, - 
predictor=predictor_triplet.predictor, + estimator_triplet.feat_idx, + estimator_triplet.neighbor_feat_idx, + estimator=estimator_triplet.estimator, fit_mode=False ) if not (it + 1) % imputations_per_round: if self.verbose > 1: print('[IterativeImputer] Ending imputation round ' '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter, time() - start_t)) + % (i_rnd + 1, self.n_iter_, time() - start_t)) i_rnd += 1 Xt[~mask_missing_values] = X[~mask_missing_values] diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index a2bf8d75ef9e5..9d063dd33bec2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -508,6 +508,55 @@ def test_imputation_copy(): # made, even if copy=False. +def test_iterative_imputer_zero_iters(): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + missing_flag = X == 0 + X[missing_flag] = np.nan + + imputer = IterativeImputer(max_iter=0) + X_imputed = imputer.fit_transform(X) + # with max_iter=0, only initial imputation is performed + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + # repeat but force n_iter_ to 0 + imputer = IterativeImputer(max_iter=5).fit(X) + # transformed should not be equal to initial imputation + assert not np.all(imputer.transform(X) == + imputer.initial_imputer_.transform(X)) + + imputer.n_iter_ = 0 + # now they should be equal as only initial imputation is done + assert_allclose(imputer.transform(X), + imputer.initial_imputer_.transform(X)) + + +def test_iterative_imputer_verbose(): + rng = np.random.RandomState(0) + + n = 100 + d = 3 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1) + imputer.fit(X) + imputer.transform(X) + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2) + imputer.fit(X) + imputer.transform(X) + + +def test_iterative_imputer_all_missing(): + n = 100 + d = 3 + X = np.zeros((n, d)) + imputer = IterativeImputer(missing_values=0, max_iter=1) + X_imputed = imputer.fit_transform(X) + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + @pytest.mark.parametrize( "imputation_order", ['random', 'roman', 'ascending', 'descending', 'arabic'] @@ -516,22 +565,24 @@ def test_iterative_imputer_imputation_order(imputation_order): rng = np.random.RandomState(0) n = 100 d = 10 + max_iter = 2 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() X[:, 0] = 1 # this column should not be discarded by IterativeImputer - n_iter = 2 imputer = IterativeImputer(missing_values=0, - n_iter=n_iter, + max_iter=max_iter, n_nearest_features=5, + sample_posterior=False, min_value=0, max_value=1, - verbose=False, + verbose=1, imputation_order=imputation_order, random_state=rng) imputer.fit_transform(X) ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] - assert len(ordered_idx) // n_iter == imputer.n_features_with_missing_ + assert (len(ordered_idx) // imputer.n_iter_ == + imputer.n_features_with_missing_) if imputation_order == 'roman': assert np.all(ordered_idx[:d-1] == np.arange(1, d)) @@ -542,14 +593,14 @@ def test_iterative_imputer_imputation_order(imputation_order): ordered_idx_round_2 = ordered_idx[d-1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: - assert len(ordered_idx) == n_iter * (d - 1) + assert len(ordered_idx) == max_iter * (d - 1) @pytest.mark.parametrize( - "predictor", + "estimator", [None, 
DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()]
 )
-def test_iterative_imputer_predictors(predictor):
+def test_iterative_imputer_estimators(estimator):
     rng = np.random.RandomState(0)
 
     n = 100
@@ -557,20 +608,20 @@ def test_iterative_imputer_predictors(predictor):
     X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
 
     imputer = IterativeImputer(missing_values=0,
-                               n_iter=1,
-                               predictor=predictor,
+                               max_iter=1,
+                               estimator=estimator,
                                random_state=rng)
     imputer.fit_transform(X)
 
-    # check that types are correct for predictors
+    # check that types are correct for estimators
     hashes = []
     for triplet in imputer.imputation_sequence_:
-        expected_type = (type(predictor) if predictor is not None
+        expected_type = (type(estimator) if estimator is not None
                          else type(BayesianRidge()))
-        assert isinstance(triplet.predictor, expected_type)
-        hashes.append(id(triplet.predictor))
+        assert isinstance(triplet.estimator, expected_type)
+        hashes.append(id(triplet.estimator))
 
-    # check that each predictor is unique
+    # check that each estimator is unique
     assert len(set(hashes)) == len(hashes)
 
 
@@ -582,7 +633,7 @@ def test_iterative_imputer_clip():
                                random_state=rng).toarray()
 
     imputer = IterativeImputer(missing_values=0,
-                               n_iter=1,
+                               max_iter=1,
                                min_value=0.1,
                                max_value=0.2,
                                random_state=rng)
@@ -593,12 +644,37 @@ def test_iterative_imputer_clip():
     assert_allclose(Xt[X != 0], X[X != 0])
 
 
+def test_iterative_imputer_clip_truncnorm():
+    rng = np.random.RandomState(0)
+    n = 100
+    d = 10
+    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
+    X[:, 0] = 1
+
+    imputer = IterativeImputer(missing_values=0,
+                               max_iter=2,
+                               n_nearest_features=5,
+                               sample_posterior=True,
+                               min_value=0.1,
+                               max_value=0.2,
+                               verbose=1,
+                               imputation_order='random',
+                               random_state=rng)
+    Xt = imputer.fit_transform(X)
+    assert_allclose(np.min(Xt[X == 0]), 0.1)
+    assert_allclose(np.max(Xt[X == 0]), 0.2)
+    assert_allclose(Xt[X != 0], X[X != 0])
+
+
 def test_iterative_imputer_truncated_normal_posterior():
    # test that the values that are imputed using `sample_posterior=True`
     # with boundaries (`min_value` and `max_value` are not None) are drawn
-    # from a distribution that looks gaussian via the Kolmogorov Smirnov test
+    # from a distribution that looks Gaussian via the Kolmogorov-Smirnov test.
+ # note that starting from the wrong random seed will make this test fail + # because random sampling doesn't occur at all when the imputation + # is outside of the (min_value, max_value) range pytest.importorskip("scipy", minversion="0.17.0") - rng = np.random.RandomState(0) + rng = np.random.RandomState(42) X = rng.normal(size=(5, 5)) X[0][0] = np.nan @@ -610,7 +686,7 @@ def test_iterative_imputer_truncated_normal_posterior(): imputer.fit_transform(X) # generate multiple imputations for the single missing value - imputations = np.array([imputer.transform(X)[0][0] for _ in range(1000)]) + imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)]) assert all(imputations >= 0) assert all(imputations <= 0.5) @@ -641,7 +717,7 @@ def test_iterative_imputer_missing_at_transform(strategy): X_test[0, 0] = 0 # definitely missing value in 0th column imputer = IterativeImputer(missing_values=0, - n_iter=1, + max_iter=1, initial_strategy=strategy, random_state=rng).fit(X_train) initial_imputer = SimpleImputer(missing_values=0, @@ -664,7 +740,7 @@ def test_iterative_imputer_transform_stochasticity(): # when sample_posterior=True, two transforms shouldn't be equal imputer = IterativeImputer(missing_values=0, - n_iter=1, + max_iter=1, sample_posterior=True, random_state=rng1) imputer.fit(X) @@ -679,14 +755,14 @@ def test_iterative_imputer_transform_stochasticity(): # and imputation_order is not random # the two transforms should be identical even if rng are different imputer1 = IterativeImputer(missing_values=0, - n_iter=1, + max_iter=1, sample_posterior=False, n_nearest_features=None, imputation_order='ascending', random_state=rng1) imputer2 = IterativeImputer(missing_values=0, - n_iter=1, + max_iter=1, sample_posterior=False, n_nearest_features=None, imputation_order='ascending', @@ -706,8 +782,8 @@ def test_iterative_imputer_no_missing(): rng = np.random.RandomState(0) X = rng.rand(100, 100) X[:, 0] = np.nan - m1 = IterativeImputer(n_iter=10, random_state=rng) - m2 = IterativeImputer(n_iter=10, random_state=rng) + m1 = IterativeImputer(max_iter=10, random_state=rng) + m2 = IterativeImputer(max_iter=10, random_state=rng) pred1 = m1.fit(X).transform(X) pred2 = m2.fit_transform(X) # should exclude the first column entirely @@ -726,7 +802,7 @@ def test_iterative_imputer_rank_one(): X_missing = X.copy() X_missing[nan_mask] = np.nan - imputer = IterativeImputer(n_iter=5, + imputer = IterativeImputer(max_iter=5, verbose=1, random_state=rng) X_filled = imputer.fit_transform(X_missing) @@ -744,7 +820,6 @@ def test_iterative_imputer_transform_recovery(rank): A = rng.rand(n, rank) B = rng.rand(rank, d) X_filled = np.dot(A, B) - # half is randomly missing nan_mask = rng.rand(n, d) < 0.5 X_missing = X_filled.copy() X_missing[nan_mask] = np.nan @@ -755,7 +830,7 @@ def test_iterative_imputer_transform_recovery(rank): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = IterativeImputer(n_iter=10, + imputer = IterativeImputer(max_iter=10, verbose=1, random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) @@ -783,21 +858,59 @@ def test_iterative_imputer_additive_matrix(): X_test_filled = X_filled[n:] X_test = X_missing[n:] - imputer = IterativeImputer(n_iter=10, - verbose=2, + imputer = IterativeImputer(max_iter=10, + verbose=1, random_state=rng).fit(X_train) X_test_est = imputer.transform(X_test) - assert_allclose(X_test_filled, X_test_est, atol=0.01) + assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01) -def test_iterative_imputer_error_param(): - rng = 
np.random.RandomState(42) - X = rng.randn(100, 2) - imputer = IterativeImputer(n_iter=-1) - with pytest.raises(ValueError, match='should be a positive integer'): +@pytest.mark.parametrize("max_iter, tol, error_type, warning", [ + (-1, 1e-3, ValueError, 'should be a positive integer'), + (1, -1e-3, ValueError, 'should be a non-negative float') +]) +def test_iterative_imputer_error_param(max_iter, tol, error_type, warning): + X = np.zeros((100, 2)) + imputer = IterativeImputer(max_iter=max_iter, tol=tol) + with pytest.raises(error_type, match=warning): imputer.fit_transform(X) +def test_iterative_imputer_early_stopping(): + rng = np.random.RandomState(0) + n = 50 + d = 5 + A = rng.rand(n, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(max_iter=100, + tol=1e-3, + sample_posterior=False, + verbose=1, + random_state=rng) + X_filled_100 = imputer.fit_transform(X_missing) + assert len(imputer.imputation_sequence_) == d * imputer.n_iter_ + + imputer = IterativeImputer(max_iter=imputer.n_iter_, + sample_posterior=False, + verbose=1, + random_state=rng) + X_filled_early = imputer.fit_transform(X_missing) + assert_allclose(X_filled_100, X_filled_early, atol=1e-7) + + imputer = IterativeImputer(max_iter=100, + tol=0, + sample_posterior=False, + verbose=1, + random_state=rng) + imputer.fit(X_missing) + assert imputer.n_iter_ == imputer.max_iter + + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), From cb3ec846872a6a43d79353b0a3cde06a82662b64 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 13 Feb 2019 08:51:12 +1100 Subject: [PATCH 19/20] pep8 --- sklearn/utils/estimator_checks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 29e8eec5bceb2..6c4196d919aa0 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -73,9 +73,10 @@ 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] -ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', 'IterativeImputer', +ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', - 'PowerTransformer', 'QuantileTransformer'] + 'PowerTransformer', 'QuantileTransformer', 'IterativeImputer'] + SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator'] From c12344051aa76907df195b27f39f4c465a94bdff Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Tue, 12 Feb 2019 21:42:04 -0800 Subject: [PATCH 20/20] API estimator is now first param of IterativeImputer (#13153) --- sklearn/impute.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index b2459a003b16f..3bb0bdd9eff15 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -433,15 +433,15 @@ class IterativeImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : int, np.nan, optional (default=np.nan) - The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. - estimator : estimator object, default=BayesianRidge() The estimator to use at each step of the round-robin imputation. If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. 
+ missing_values : int, np.nan, optional (default=np.nan) + The placeholder for the missing values. All occurrences of + ``missing_values`` will be imputed. + sample_posterior : boolean, default=False Whether to sample from the (Gaussian) predictive posterior of the fitted estimator for each imputation. Estimator must support @@ -559,8 +559,8 @@ class IterativeImputer(BaseEstimator, TransformerMixin): """ def __init__(self, - missing_values=np.nan, estimator=None, + missing_values=np.nan, sample_posterior=False, max_iter=10, tol=1e-3, @@ -572,8 +572,8 @@ def __init__(self, verbose=0, random_state=None): - self.missing_values = missing_values self.estimator = estimator + self.missing_values = missing_values self.sample_posterior = sample_posterior self.max_iter = max_iter self.tol = tol
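The ``sample_posterior`` branch patched into ``_impute_one_feature`` earlier
in this series bounds out-of-range posterior means before drawing from a
truncated normal. A minimal standalone sketch of that
``scipy.stats.truncnorm`` parameterization — variable names mirror the
patch, but the means and sigmas are made up for illustration::

    import numpy as np
    from scipy import stats

    mus = np.array([0.05, 0.15, 0.40])     # posterior means (made up)
    sigmas = np.array([0.00, 0.02, 0.02])  # note one degenerate sigma
    min_value, max_value = 0.1, 0.2

    imputed = np.empty_like(mus)
    positive_sigmas = sigmas > 0
    # degenerate sigmas: no sampling, just take (and bound) the mean
    imputed[~positive_sigmas] = np.clip(mus[~positive_sigmas],
                                        min_value, max_value)
    # means outside [min_value, max_value] would push truncnorm to +/-inf,
    # so they are pinned to the nearest bound instead
    mus_too_low = mus < min_value
    mus_too_high = mus > max_value
    imputed[mus_too_low & positive_sigmas] = min_value
    imputed[mus_too_high & positive_sigmas] = max_value
    # the rest can be sampled safely; truncnorm expects the bounds in
    # standard-deviation units around loc
    inrange = positive_sigmas & ~mus_too_low & ~mus_too_high
    a = (min_value - mus[inrange]) / sigmas[inrange]
    b = (max_value - mus[inrange]) / sigmas[inrange]
    imputed[inrange] = stats.truncnorm(
        a=a, b=b, loc=mus[inrange], scale=sigmas[inrange]).rvs(
            random_state=0)
    assert np.all((imputed >= min_value) & (imputed <= max_value))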