diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 88201ba8f2ad6..e8fc80644c002 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -655,8 +655,9 @@ Kernels: :template: class.rst impute.SimpleImputer + impute.IterativeImputer impute.MissingIndicator - + .. _kernel_approximation_ref: :mod:`sklearn.kernel_approximation` Kernel Approximation diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 933685f8bfa6f..6de5df8b12729 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -9,12 +9,28 @@ Imputation of missing values For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with scikit-learn estimators which assume that all values in an -array are numerical, and that all have and hold meaning. A basic strategy to use -incomplete datasets is to discard entire rows and/or columns containing missing -values. However, this comes at the price of losing data which may be valuable -(even though incomplete). A better strategy is to impute the missing values, -i.e., to infer them from the known part of the data. See the :ref:`glossary` -entry on imputation. +array are numerical, and that all have and hold meaning. A basic strategy to +use incomplete datasets is to discard entire rows and/or columns containing +missing values. However, this comes at the price of losing data which may be +valuable (even though incomplete). A better strategy is to impute the missing +values, i.e., to infer them from the known part of the data. See the +:ref:`glossary` entry on imputation. + + +Univariate vs. Multivariate Imputation +====================================== + +One type of imputation algorithm is univariate, which imputes values in the +i-th feature dimension using only non-missing values in that feature dimension +(e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation +algorithms use the entire set of available feature dimensions to estimate the +missing values (e.g. :class:`impute.IterativeImputer`). + + +.. _single_imputer: + +Univariate feature imputation +============================= The :class:`SimpleImputer` class provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or using @@ -50,9 +66,9 @@ The :class:`SimpleImputer` class also supports sparse matrices:: [6. 3.] [7. 6.]] -Note that this format is not meant to be used to implicitly store missing values -in the matrix because it would densify it at transform time. Missing values encoded -by 0 must be used with dense input. +Note that this format is not meant to be used to implicitly store missing +values in the matrix because it would densify it at transform time. Missing +values encoded by 0 must be used with dense input. The :class:`SimpleImputer` class also supports categorical data represented as string values or pandas categoricals when using the ``'most_frequent'`` or @@ -71,9 +87,92 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] +.. _iterative_imputer: + + +Multivariate feature imputation +=============================== + +A more sophisticated approach is to use the :class:`IterativeImputer` class, +which models each feature with missing values as a function of other features, +and uses that estimate for imputation. 
It does so in an iterated round-robin +fashion: at each step, a feature column is designated as output ``y`` and the +other feature columns are treated as inputs ``X``. A regressor is fit on ``(X, +y)`` for known ``y``. Then, the regressor is used to predict the missing values +of ``y``. This is done for each feature in an iterative fashion, and then is +repeated for ``max_iter`` imputation rounds. The results of the final +imputation round are returned. + + >>> import numpy as np + >>> from sklearn.impute import IterativeImputer + >>> imp = IterativeImputer(max_iter=10, random_state=0) + >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE + IterativeImputer(estimator=None, imputation_order='ascending', + initial_strategy='mean', max_iter=10, max_value=None, + min_value=None, missing_values=nan, n_nearest_features=None, + random_state=0, sample_posterior=False, tol=0.001, verbose=0) + >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] + >>> # the model learns that the second feature is double the first + >>> print(np.round(imp.transform(X_test))) + [[ 1. 2.] + [ 6. 12.] + [ 3. 6.]] + +Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a +Pipeline as a way to build a composite estimator that supports imputation. +See :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + +Flexibility of IterativeImputer +------------------------------- + +There are many well-established imputation packages in the R data science +ecosystem: Amelia, mi, mice, missForest, etc. missForest is popular, and turns +out to be a particular instance of different sequential imputation algorithms +that can all be implemented with :class:`IterativeImputer` by passing in +different regressors to be used for predicting missing feature values. In the +case of missForest, this regressor is a Random Forest. +See :ref:`sphx_glr_auto_examples_plot_iterative_imputer_variants_comparison.py`. + + +.. _multiple_imputation: + +Multiple vs. Single Imputation +------------------------------ + +In the statistics community, it is common practice to perform multiple +imputations, generating, for example, ``m`` separate imputations for a single +feature matrix. Each of these ``m`` imputations is then put through the +subsequent analysis pipeline (e.g. feature engineering, clustering, regression, +classification). The ``m`` final analysis results (e.g. held-out validation +errors) allow the data scientist to obtain understanding of how analytic +results may differ as a consequence of the inherent uncertainty caused by the +missing values. The above practice is called multiple imputation. + +Our implementation of :class:`IterativeImputer` was inspired by the R MICE +package (Multivariate Imputation by Chained Equations) [1]_, but differs from +it by returning a single imputation instead of multiple imputations. However, +:class:`IterativeImputer` can also be used for multiple imputations by applying +it repeatedly to the same dataset with different random seeds when +``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple +vs. single imputations. + +It is still an open problem as to how useful single vs. multiple imputation is +in the context of prediction and classification when the user is not +interested in measuring uncertainty due to missing values. + +Note that a call to the ``transform`` method of :class:`IterativeImputer` is +not allowed to change the number of samples. 
Therefore multiple imputations
+cannot be achieved by a single call to ``transform``.
+
+References
+==========
+
+.. [1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate
+    Imputation by Chained Equations in R". Journal of Statistical Software 45:
+    1-67.

-:class:`SimpleImputer` can be used in a Pipeline as a way to build a composite
-estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`.
+.. [2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis
+    with Missing Data". John Wiley & Sons, Inc., New York, NY, USA.

 .. _missing_indicator:

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 3bbd9f3189bd4..4d5abf0083753 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -115,6 +115,15 @@ Support for Python 3.4 and below has been officially dropped.
 - |API| Deprecated :mod:`externals.six` since we have dropped support for
   Python 2.7. :issue:`12916` by :user:`Hanmin Qin `.

+:mod:`sklearn.impute`
+.....................
+
+- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy
+  for imputing missing values by modeling each feature with missing values as a
+  function of other features in a round-robin fashion. :issue:`8478` and
+  :issue:`12177` by :user:`Sergey Feldman ` and :user:`Ben Lawson
+  `.
+
 :mod:`sklearn.linear_model`
 ...........................

diff --git a/examples/impute/README.txt b/examples/impute/README.txt
new file mode 100644
index 0000000000000..e42264caf9087
--- /dev/null
+++ b/examples/impute/README.txt
@@ -0,0 +1,6 @@
+.. _impute_examples:
+
+Missing Value Imputation
+------------------------
+
+Examples concerning the :mod:`sklearn.impute` module.
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
new file mode 100644
index 0000000000000..77a12e87a1e8a
--- /dev/null
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -0,0 +1,126 @@
+"""
+=========================================================
+Imputing missing values with variants of IterativeImputer
+=========================================================
+
+The :class:`sklearn.impute.IterativeImputer` class is very flexible - it can be
+used with a variety of estimators to do round-robin regression, treating every
+variable as an output in turn.
+
+In this example we compare some estimators for the purpose of missing feature
+imputation with :class:`sklearn.impute.IterativeImputer`::
+
+    :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression
+    :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression
+    :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R
+    :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN
+    imputation approaches
+
+Of particular interest is the ability of
+:class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a
+popular imputation package for R. In this example, we have chosen to use
+:class:`sklearn.ensemble.ExtraTreesRegressor` instead of
+:class:`sklearn.ensemble.RandomForestRegressor` (as in missForest) due to its
+increased speed.
+
+Note that :class:`sklearn.neighbors.KNeighborsRegressor` is different from KNN
+imputation, which learns from samples with missing values by using a distance
+metric that accounts for missing values, rather than imputing them.
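+
+As a quick reference, the pattern used in the code below can be condensed to
+the following sketch (assuming the default ``IterativeImputer`` settings
+otherwise)::
+
+    from sklearn.ensemble import ExtraTreesRegressor
+    from sklearn.impute import IterativeImputer
+    from sklearn.linear_model import BayesianRidge
+    from sklearn.pipeline import make_pipeline
+
+    # round-robin imputation with a tree-based regressor (missForest-like),
+    # chained with the BayesianRidge regressor used for the final prediction
+    imputer_pipeline = make_pipeline(
+        IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10,
+                                                       random_state=0),
+                         random_state=0),
+        BayesianRidge()
+    )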
+ +The goal is to compare different estimators to see which one is best for the +:class:`sklearn.impute.IterativeImputer` when using a +:class:`sklearn.linear_model.BayesianRidge` estimator on the California housing +dataset with a single value randomly removed from each row. + +For this particular pattern of missing values we see that +:class:`sklearn.ensemble.ExtraTreesRegressor` and +:class:`sklearn.linear_model.BayesianRidge` give the best results. +""" +print(__doc__) + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.datasets import fetch_california_housing +from sklearn.impute import SimpleImputer +from sklearn.impute import IterativeImputer +from sklearn.linear_model import BayesianRidge +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import ExtraTreesRegressor +from sklearn.neighbors import KNeighborsRegressor +from sklearn.pipeline import make_pipeline +from sklearn.model_selection import cross_val_score + +N_SPLITS = 5 + +rng = np.random.RandomState(0) + +X_full, y_full = fetch_california_housing(return_X_y=True) +n_samples, n_features = X_full.shape + +# Estimate the score on the entire dataset, with no missing values +br_estimator = BayesianRidge() +score_full_data = pd.DataFrame( + cross_val_score( + br_estimator, X_full, y_full, scoring='neg_mean_squared_error', + cv=N_SPLITS + ), + columns=['Full Data'] +) + +# Add a single missing value to each row +X_missing = X_full.copy() +y_missing = y_full +missing_samples = np.arange(n_samples) +missing_features = rng.choice(n_features, n_samples, replace=True) +X_missing[missing_samples, missing_features] = np.nan + +# Estimate the score after imputation (mean and median strategies) +score_simple_imputer = pd.DataFrame() +for strategy in ('mean', 'median'): + estimator = make_pipeline( + SimpleImputer(missing_values=np.nan, strategy=strategy), + br_estimator + ) + score_simple_imputer[strategy] = cross_val_score( + estimator, X_missing, y_missing, scoring='neg_mean_squared_error', + cv=N_SPLITS + ) + +# Estimate the score after iterative imputation of the missing values +# with different estimators +estimators = [ + BayesianRidge(), + DecisionTreeRegressor(max_features='sqrt', random_state=0), + ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0), + KNeighborsRegressor(n_neighbors=15) +] +score_iterative_imputer = pd.DataFrame() +for estimator in estimators: + estimator = make_pipeline( + IterativeImputer(random_state=0, estimator=estimator), + br_estimator + ) + score_iterative_imputer[estimator.__class__.__name__] = \ + cross_val_score( + estimator, X_missing, y_missing, scoring='neg_mean_squared_error', + cv=N_SPLITS + ) + +scores = pd.concat( + [score_full_data, score_simple_imputer, score_iterative_imputer], + keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1 +) + +# plot boston results +fig, ax = plt.subplots(figsize=(13, 6)) +means = -scores.mean() +errors = scores.std() +means.plot.barh(xerr=errors, ax=ax) +ax.set_title('California Housing Regression with Different Imputation Methods') +ax.set_xlabel('MSE (smaller is better)') +ax.set_yticks(np.arange(means.shape[0])) +ax.set_yticklabels([" w/ ".join(label) for label in means.index.get_values()]) +plt.tight_layout(pad=1) +plt.show() diff --git a/examples/plot_missing_values.py b/examples/impute/plot_missing_values.py similarity index 62% rename from examples/plot_missing_values.py rename to examples/impute/plot_missing_values.py index 755943fb55bda..897b66aad246c 100644 --- 
a/examples/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -3,21 +3,23 @@ Imputing missing values before building an estimator ==================================================== -This example shows that imputing the missing values can give better -results than discarding the samples containing any missing value. -Imputing does not always improve the predictions, so please check via -cross-validation. Sometimes dropping rows or using marker values is -more effective. - Missing values can be replaced by the mean, the median or the most frequent -value using the basic :func:`sklearn.impute.SimpleImputer`. +value using the basic :class:`sklearn.impute.SimpleImputer`. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). +Another option is the :class:`sklearn.impute.IterativeImputer`. This uses +round-robin linear regression, treating every variable as an output in +turn. The version implemented assumes Gaussian (output) variables. If your +features are obviously non-Normal, consider transforming them to look more +Normal so as to potentially improve performance. + In addition of using an imputing method, we can also keep an indication of the missing information using :func:`sklearn.impute.MissingIndicator` which might carry some information. """ +print(__doc__) + import numpy as np import matplotlib.pyplot as plt @@ -25,11 +27,24 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import SimpleImputer, MissingIndicator +from sklearn.impute import SimpleImputer, IterativeImputer, MissingIndicator from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) +N_SPLITS = 5 +REGRESSOR = RandomForestRegressor(random_state=0, n_estimators=100) + + +def get_scores_for_imputer(imputer, X_missing, y_missing): + estimator = make_pipeline( + make_union(imputer, MissingIndicator(missing_values=0)), + REGRESSOR) + impute_scores = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error', + cv=N_SPLITS) + return impute_scores + def get_results(dataset): X_full, y_full = dataset.data, dataset.target @@ -37,9 +52,9 @@ def get_results(dataset): n_features = X_full.shape[1] # Estimate the score on the entire dataset, with no missing values - estimator = RandomForestRegressor(random_state=0, n_estimators=100) - full_scores = cross_val_score(estimator, X_full, y_full, - scoring='neg_mean_squared_error', cv=5) + full_scores = cross_val_score(REGRESSOR, X_full, y_full, + scoring='neg_mean_squared_error', + cv=N_SPLITS) # Add missing values in 75% of the lines missing_rate = 0.75 @@ -50,32 +65,32 @@ def get_results(dataset): dtype=np.bool))) rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) - - # Estimate the score after replacing missing values by 0 X_missing = X_full.copy() X_missing[np.where(missing_samples)[0], missing_features] = 0 y_missing = y_full.copy() - estimator = RandomForestRegressor(random_state=0, n_estimators=100) - zero_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=5) + + # Estimate the score after replacing missing values by 0 + imputer = SimpleImputer(missing_values=0, + strategy='constant', + fill_value=0) + zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) # Estimate the score after imputation 
(mean strategy) of the missing values - X_missing = X_full.copy() - X_missing[np.where(missing_samples)[0], missing_features] = 0 - y_missing = y_full.copy() - estimator = make_pipeline( - make_union(SimpleImputer(missing_values=0, strategy="mean"), - MissingIndicator(missing_values=0)), - RandomForestRegressor(random_state=0, n_estimators=100)) - mean_impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=5) + imputer = SimpleImputer(missing_values=0, strategy="mean") + mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + # Estimate the score after iterative imputation of the missing values + imputer = IterativeImputer(missing_values=0, + random_state=0, + n_nearest_features=5) + iterative_impute_scores = get_scores_for_imputer(imputer, + X_missing, + y_missing) return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std())) + (mean_impute_scores.mean(), mean_impute_scores.std()), + (iterative_impute_scores.mean(), iterative_impute_scores.std())) results_diabetes = np.array(get_results(load_diabetes())) @@ -91,7 +106,8 @@ def get_results(dataset): x_labels = ['Full data', 'Zero imputation', - 'Mean Imputation'] + 'Mean Imputation', + 'Multivariate Imputation'] colors = ['r', 'g', 'b', 'orange'] # plot diabetes results diff --git a/sklearn/impute.py b/sklearn/impute.py index fdafa6ad6e198..3bb0bdd9eff15 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -3,16 +3,25 @@ # Sergey Feldman # License: BSD 3 clause +from __future__ import division + import warnings import numbers +from time import time +from distutils.version import LooseVersion import numpy as np import numpy.ma as ma +import scipy from scipy import sparse from scipy import stats +from collections import namedtuple from .base import BaseEstimator, TransformerMixin -from .utils import check_array +from .base import clone +from .exceptions import ConvergenceWarning +from .preprocessing import normalize +from .utils import check_array, check_random_state, safe_indexing from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES @@ -20,9 +29,14 @@ from .utils import is_scalar_nan +ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', + 'neighbor_feat_idx', + 'estimator']) + __all__ = [ 'MissingIndicator', 'SimpleImputer', + 'IterativeImputer', ] @@ -134,6 +148,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): statistics_ : array of shape (n_features,) The imputation fill value for each feature. + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + Examples -------- >>> import numpy as np @@ -405,6 +423,618 @@ def transform(self, X): return X +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. + + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object, default=BayesianRidge() + The estimator to use at each step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support + ``return_std`` in its ``predict`` method. + + missing_values : int, np.nan, optional (default=np.nan) + The placeholder for the missing values. 
All occurrences of + ``missing_values`` will be imputed. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted estimator for each imputation. Estimator must support + ``return_std`` in its ``predict`` method if set to ``True``. Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. + + max_iter : int, optional (default=10) + Maximum number of imputation rounds to perform before returning the + imputations computed during the final round. A round is a single + imputation of each feature with missing values. The stopping criterion + is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, + where `X_t` is `X` at iteration `t. Note that early stopping is only + applied if ``sample_posterior=False``. + + tol : float, optional (default=1e-3) + Tolerance of the stopping condition. + + n_nearest_features : int, optional (default=None) + Number of other features to use to estimate the missing values of + each feature column. Nearness between features is measured using + the absolute correlation coefficient between each feature pair (after + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. + + initial_strategy : str, optional (default="mean") + Which strategy to use to initialize the missing values. Same as the + ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` + Valid values: {"mean", "median", "most_frequent", or "constant"}. + + imputation_order : str, optional (default="ascending") + The order in which the features will be imputed. Possible values: + + "ascending" + From features with fewest missing values to most. + "descending" + From features with most missing values to fewest. + "roman" + Left to right. + "arabic" + Right to left. + "random" + A random order for each round. + + min_value : float, optional (default=None) + Minimum possible imputed value. Default of ``None`` will set minimum + to negative infinity. + + max_value : float, optional (default=None) + Maximum possible imputed value. Default of ``None`` will set maximum + to positive infinity. + + verbose : int, optional (default=0) + Verbosity flag, controls the debug messages that are issued + as functions are evaluated. The higher, the more verbose. Can be 0, 1, + or 2. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use. Randomizes + selection of estimator features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary `. + + Attributes + ---------- + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. + + imputation_sequence_ : list of tuples + Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where + ``feat_idx`` is the current feature to be imputed, + ``neighbor_feat_idx`` is the array of other features used to impute the + current feature, and ``estimator`` is the trained estimator used for + the imputation. Length is ``self.n_features_with_missing_ * + self.n_iter_``. 
+ + n_iter_ : int + Number of iteration rounds that occurred. Will be less than + ``self.max_iter`` if early stopping criterion was reached. + + n_features_with_missing_ : int + Number of features with missing values. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. + + Notes + ----- + To support imputation in inductive mode we store each feature's estimator + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. + + Features which contain all missing values at ``fit`` are discarded upon + ``transform``. + + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. + + References + ---------- + .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: + Multivariate Imputation by Chained Equations in R". Journal of + Statistical Software 45: 1-67. + `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + `_ + """ + + def __init__(self, + estimator=None, + missing_values=np.nan, + sample_posterior=False, + max_iter=10, + tol=1e-3, + n_nearest_features=None, + initial_strategy="mean", + imputation_order='ascending', + min_value=None, + max_value=None, + verbose=0, + random_state=None): + + self.estimator = estimator + self.missing_values = missing_values + self.sample_posterior = sample_posterior + self.max_iter = max_iter + self.tol = tol + self.n_nearest_features = n_nearest_features + self.initial_strategy = initial_strategy + self.imputation_order = imputation_order + self.min_value = min_value + self.max_value = max_value + self.verbose = verbose + self.random_state = random_state + + def _impute_one_feature(self, + X_filled, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + estimator=None, + fit_mode=True): + """Impute a single feature from the others provided. + + This function predicts the missing values of one of the features using + the current estimates of all the other features. The ``estimator`` must + support ``return_std=True`` in its ``predict`` method for this function + to work. + + Parameters + ---------- + X_filled : ndarray + Input data with the most recent imputations. + + mask_missing_values : ndarray + Input data's missing indicator matrix. + + feat_idx : int + Index of the feature currently being imputed. + + neighbor_feat_idx : ndarray + Indices of the features to be used in imputing ``feat_idx``. + + estimator : object + The estimator to use at this step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support + ``return_std`` in its ``predict`` method. + If None, it will be cloned from self._estimator. + + fit_mode : boolean, default=True + Whether to fit and predict with the estimator or just predict. + + Returns + ------- + X_filled : ndarray + Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. + + estimator : estimator with sklearn API + The fitted estimator used to impute + ``X_filled[missing_row_mask, feat_idx]``. 
+ """ + + # if nothing is missing, just return the default + # (should not happen at fit time because feat_ids would be excluded) + missing_row_mask = mask_missing_values[:, feat_idx] + if not np.any(missing_row_mask): + return X_filled, estimator + + if estimator is None and fit_mode is False: + raise ValueError("If fit_mode is False, then an already-fitted " + "estimator should be passed in.") + + if estimator is None: + estimator = clone(self._estimator) + + if fit_mode: + X_train = safe_indexing(X_filled[:, neighbor_feat_idx], + ~missing_row_mask) + y_train = safe_indexing(X_filled[:, feat_idx], + ~missing_row_mask) + estimator.fit(X_train, y_train) + + # get posterior samples + X_test = safe_indexing(X_filled[:, neighbor_feat_idx], + missing_row_mask) + if self.sample_posterior: + mus, sigmas = estimator.predict(X_test, return_std=True) + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + # two types of problems: (1) non-positive sigmas, (2) mus outside + # legal range of min_value and max_value (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value + imputed_values[mus_too_low] = self._min_value + mus_too_high = mus > self._max_value + imputed_values[mus_too_high] = self._max_value + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] + a = (self._min_value - mus) / sigmas + b = (self._max_value - mus) / sigmas + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[inrange_mask] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[inrange_mask] = truncated_normal.rvs( + random_state=self.random_state_) + else: + imputed_values = estimator.predict(X_test) + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) + + # update the feature + X_filled[missing_row_mask, feat_idx] = imputed_values + return X_filled, estimator + + def _get_neighbor_feat_idx(self, + n_features, + feat_idx, + abs_corr_mat): + """Get a list of other features to predict ``feat_idx``. + + If self.n_nearest_features is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between ``feat_idx`` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in ``X``. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X``. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute ``feat_idx``. 
+ """ + if (self.n_nearest_features is not None and + self.n_nearest_features < n_features): + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, + p=p) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. + """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + missing_values_idx = np.nonzero(frac_of_missing_values)[0] + if self.imputation_order == 'roman': + ordered_idx = missing_values_idx + elif self.imputation_order == 'arabic': + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == 'ascending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:][::-1] + elif self.imputation_order == 'descending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:] + elif self.imputation_order == 'random': + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + else: + raise ValueError("Got an invalid imputation order: '{0}'. It must " + "be one of the following: 'roman', 'arabic', " + "'ascending', 'descending', or " + "'random'.".format(self.imputation_order)) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, optional (default=1e-6) + ``abs_corr_mat`` can have nans, which will be replaced + with ``tolerance``. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X`` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. + """ + n_features = X_filled.shape[1] + if (self.n_nearest_features is None or + self.n_nearest_features >= n_features): + return None + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X): + """Perform initial imputation for input X. 
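+
+        The initial fill is produced by a :class:`SimpleImputer` configured
+        with ``self.initial_strategy``; features whose initial statistics are
+        undefined (i.e. entirely missing when the initial imputer was fit)
+        are dropped from the returned ``Xt`` and mask.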
+ + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + Returns + ------- + Xt : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + mask_missing_values : ndarray, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + """ + if is_scalar_nan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + X = check_array(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + + mask_missing_values = _get_mask(X, self.missing_values) + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.transform(X) + + valid_mask = np.flatnonzero(np.logical_not( + np.isnan(self.initial_imputer_.statistics_))) + Xt = X[:, valid_mask] + mask_missing_values = mask_missing_values[:, valid_mask] + + return Xt, X_filled, mask_missing_values + + def fit_transform(self, X, y=None): + """Fits the imputer on X and return the transformed X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + self.random_state_ = getattr(self, "random_state_", + check_random_state(self.random_state)) + + if self.max_iter < 0: + raise ValueError( + "'max_iter' should be a positive integer. Got {} instead." + .format(self.max_iter)) + + if self.tol < 0: + raise ValueError( + "'tol' should be a non-negative float. Got {} instead." + .format(self.tol) + ) + + if self.estimator is None: + from .linear_model import BayesianRidge + self._estimator = BayesianRidge() + else: + self._estimator = clone(self.estimator) + + self.imputation_sequence_ = [] + + if hasattr(self._estimator, 'random_state'): + self._estimator.random_state = self.random_state_ + + self._min_value = -np.inf if self.min_value is None else self.min_value + self._max_value = np.inf if self.max_value is None else self.max_value + + self.initial_imputer_ = None + X, Xt, mask_missing_values = self._initial_imputation(X) + + if self.max_iter == 0 or np.all(mask_missing_values): + self.n_iter_ = 0 + return Xt + + # order in which to impute + # note this is probably too slow for large feature data (d > 100000) + # and a better way would be good. 
+ # see: https://goo.gl/KyCNwj and subsequent comments + ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) + + abs_corr_mat = self._get_abs_corr_mat(Xt) + + n_samples, n_features = Xt.shape + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + if not self.sample_posterior: + Xt_previous = Xt.copy() + normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) + for self.n_iter_ in range(1, self.max_iter + 1): + if self.imputation_order == 'random': + ordered_idx = self._get_ordered_idx(mask_missing_values) + + for feat_idx in ordered_idx: + neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, + feat_idx, + abs_corr_mat) + Xt, estimator = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, + estimator=None, fit_mode=True) + estimator_triplet = ImputerTriplet(feat_idx, + neighbor_feat_idx, + estimator) + self.imputation_sequence_.append(estimator_triplet) + + if self.verbose > 1: + print('[IterativeImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (self.n_iter_, self.max_iter, time() - start_t)) + + if not self.sample_posterior: + inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, + axis=None) + if inf_norm < normalized_tol: + if self.verbose > 0: + print('[IterativeImputer] Early stopping criterion ' + 'reached.') + break + Xt_previous = Xt.copy() + else: + if not self.sample_posterior: + warnings.warn("[IterativeImputer] Early stopping criterion not" + " reached.", ConvergenceWarning) + Xt[~mask_missing_values] = X[~mask_missing_values] + return Xt + + def transform(self, X): + """Imputes all missing values in X. + + Note that this is stochastic, and that if random_state is not fixed, + repeated calls, or permuted input, will yield different results. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The input data to complete. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + check_is_fitted(self, 'initial_imputer_') + + X, Xt, mask_missing_values = self._initial_imputation(X) + + if self.n_iter_ == 0 or np.all(mask_missing_values): + return Xt + + imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ + i_rnd = 0 + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + for it, estimator_triplet in enumerate(self.imputation_sequence_): + Xt, _ = self._impute_one_feature( + Xt, + mask_missing_values, + estimator_triplet.feat_idx, + estimator_triplet.neighbor_feat_idx, + estimator=estimator_triplet.estimator, + fit_mode=False + ) + if not (it + 1) % imputations_per_round: + if self.verbose > 1: + print('[IterativeImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (i_rnd + 1, self.n_iter_, time() - start_t)) + i_rnd += 1 + + Xt[~mask_missing_values] = X[~mask_missing_values] + return Xt + + def fit(self, X, y=None): + """Fits the imputer on X and return self. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored + + Returns + ------- + self : object + Returns self. + """ + self.fit_transform(X) + return self + + class MissingIndicator(BaseEstimator, TransformerMixin): """Binary indicators for missing values. 
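
The ``Notes`` section of ``IterativeImputer`` above describes its inductive
behaviour: the per-feature estimators fitted during ``fit``/``fit_transform``
are stored in ``imputation_sequence_`` and are replayed in order, without
refitting, by ``transform``. A minimal usage sketch of that behaviour (the toy
data below is illustrative only)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    X_train = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
    X_new = [[np.nan, 4], [5, np.nan]]

    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(X_train)
    # one ImputerTriplet per (imputation round, feature with missing values)
    print(len(imp.imputation_sequence_))
    # transform() reuses the stored estimators (fit_mode=False), no refitting
    print(imp.transform(X_new))
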
diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 058cce9a33834..dfa0134d5ab42 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1,7 +1,10 @@ +from __future__ import division + import pytest import numpy as np from scipy import sparse +from scipy.stats import kstest import io @@ -11,7 +14,9 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.impute import MissingIndicator -from sklearn.impute import SimpleImputer +from sklearn.impute import SimpleImputer, IterativeImputer +from sklearn.dummy import DummyRegressor +from sklearn.linear_model import BayesianRidge, ARDRegression, RidgeCV from sklearn.pipeline import Pipeline from sklearn.pipeline import make_union from sklearn.model_selection import GridSearchCV @@ -69,6 +74,10 @@ def test_imputation_shape(): X_imputed = imputer.fit_transform(X) assert X_imputed.shape == (10, 2) + iterative_imputer = IterativeImputer(initial_strategy=strategy) + X_imputed = iterative_imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) + @pytest.mark.parametrize("strategy", ["const", 101, None]) def test_imputation_error_invalid_strategy(strategy): @@ -500,6 +509,409 @@ def test_imputation_copy(): # made, even if copy=False. +def test_iterative_imputer_zero_iters(): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + missing_flag = X == 0 + X[missing_flag] = np.nan + + imputer = IterativeImputer(max_iter=0) + X_imputed = imputer.fit_transform(X) + # with max_iter=0, only initial imputation is performed + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + # repeat but force n_iter_ to 0 + imputer = IterativeImputer(max_iter=5).fit(X) + # transformed should not be equal to initial imputation + assert not np.all(imputer.transform(X) == + imputer.initial_imputer_.transform(X)) + + imputer.n_iter_ = 0 + # now they should be equal as only initial imputation is done + assert_allclose(imputer.transform(X), + imputer.initial_imputer_.transform(X)) + + +def test_iterative_imputer_verbose(): + rng = np.random.RandomState(0) + + n = 100 + d = 3 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1) + imputer.fit(X) + imputer.transform(X) + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2) + imputer.fit(X) + imputer.transform(X) + + +def test_iterative_imputer_all_missing(): + n = 100 + d = 3 + X = np.zeros((n, d)) + imputer = IterativeImputer(missing_values=0, max_iter=1) + X_imputed = imputer.fit_transform(X) + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + +@pytest.mark.parametrize( + "imputation_order", + ['random', 'roman', 'ascending', 'descending', 'arabic'] +) +def test_iterative_imputer_imputation_order(imputation_order): + rng = np.random.RandomState(0) + n = 100 + d = 10 + max_iter = 2 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 # this column should not be discarded by IterativeImputer + + imputer = IterativeImputer(missing_values=0, + max_iter=max_iter, + n_nearest_features=5, + sample_posterior=False, + min_value=0, + max_value=1, + verbose=1, + imputation_order=imputation_order, + random_state=rng) + imputer.fit_transform(X) + ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + + assert (len(ordered_idx) // imputer.n_iter_ == + imputer.n_features_with_missing_) + + if imputation_order 
== 'roman': + assert np.all(ordered_idx[:d-1] == np.arange(1, d)) + elif imputation_order == 'arabic': + assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1)) + elif imputation_order == 'random': + ordered_idx_round_1 = ordered_idx[:d-1] + ordered_idx_round_2 = ordered_idx[d-1:] + assert ordered_idx_round_1 != ordered_idx_round_2 + elif 'ending' in imputation_order: + assert len(ordered_idx) == max_iter * (d - 1) + + +@pytest.mark.parametrize( + "estimator", + [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] +) +def test_iterative_imputer_estimators(estimator): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + + imputer = IterativeImputer(missing_values=0, + max_iter=1, + estimator=estimator, + random_state=rng) + imputer.fit_transform(X) + + # check that types are correct for estimators + hashes = [] + for triplet in imputer.imputation_sequence_: + expected_type = (type(estimator) if estimator is not None + else type(BayesianRidge())) + assert isinstance(triplet.estimator, expected_type) + hashes.append(id(triplet.estimator)) + + # check that each estimator is unique + assert len(set(hashes)) == len(hashes) + + +def test_iterative_imputer_clip(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, + random_state=rng).toarray() + + imputer = IterativeImputer(missing_values=0, + max_iter=1, + min_value=0.1, + max_value=0.2, + random_state=rng) + + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +def test_iterative_imputer_clip_truncnorm(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 + + imputer = IterativeImputer(missing_values=0, + max_iter=2, + n_nearest_features=5, + sample_posterior=True, + min_value=0.1, + max_value=0.2, + verbose=1, + imputation_order='random', + random_state=rng) + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +def test_iterative_imputer_truncated_normal_posterior(): + # test that the values that are imputed using `sample_posterior=True` + # with boundaries (`min_value` and `max_value` are not None) are drawn + # from a distribution that looks gaussian via the Kolmogorov Smirnov test. 
+ # note that starting from the wrong random seed will make this test fail + # because random sampling doesn't occur at all when the imputation + # is outside of the (min_value, max_value) range + pytest.importorskip("scipy", minversion="0.17.0") + rng = np.random.RandomState(42) + + X = rng.normal(size=(5, 5)) + X[0][0] = np.nan + + imputer = IterativeImputer(min_value=0, + max_value=0.5, + sample_posterior=True, + random_state=rng) + + imputer.fit_transform(X) + # generate multiple imputations for the single missing value + imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)]) + + assert all(imputations >= 0) + assert all(imputations <= 0.5) + + mu, sigma = imputations.mean(), imputations.std() + ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm') + if sigma == 0: + sigma += 1e-12 + ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm') + # we want to fail to reject null hypothesis + # null hypothesis: distributions are the same + assert ks_statistic < 0.2 or p_value > 0.1, \ + "The posterior does appear to be normal" + + +@pytest.mark.parametrize( + "strategy", + ["mean", "median", "most_frequent"] +) +def test_iterative_imputer_missing_at_transform(strategy): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X_train = rng.randint(low=0, high=3, size=(n, d)) + X_test = rng.randint(low=0, high=3, size=(n, d)) + + X_train[:, 0] = 1 # definitely no missing values in 0th column + X_test[0, 0] = 0 # definitely missing value in 0th column + + imputer = IterativeImputer(missing_values=0, + max_iter=1, + initial_strategy=strategy, + random_state=rng).fit(X_train) + initial_imputer = SimpleImputer(missing_values=0, + strategy=strategy).fit(X_train) + + # if there were no missing values at time of fit, then imputer will + # only use the initial imputer for that feature at transform + assert np.all(imputer.transform(X_test)[:, 0] == + initial_imputer.transform(X_test)[:, 0]) + + +def test_iterative_imputer_transform_stochasticity(): + pytest.importorskip("scipy", minversion="0.17.0") + rng1 = np.random.RandomState(0) + rng2 = np.random.RandomState(1) + n = 100 + d = 10 + X = sparse_random_matrix(n, d, density=0.10, + random_state=rng1).toarray() + + # when sample_posterior=True, two transforms shouldn't be equal + imputer = IterativeImputer(missing_values=0, + max_iter=1, + sample_posterior=True, + random_state=rng1) + imputer.fit(X) + + X_fitted_1 = imputer.transform(X) + X_fitted_2 = imputer.transform(X) + + # sufficient to assert that the means are not the same + assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) + + # when sample_posterior=False, and n_nearest_features=None + # and imputation_order is not random + # the two transforms should be identical even if rng are different + imputer1 = IterativeImputer(missing_values=0, + max_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng1) + + imputer2 = IterativeImputer(missing_values=0, + max_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order='ascending', + random_state=rng2) + imputer1.fit(X) + imputer2.fit(X) + + X_fitted_1a = imputer1.transform(X) + X_fitted_1b = imputer1.transform(X) + X_fitted_2 = imputer2.transform(X) + + assert np.all(X_fitted_1a == X_fitted_1b) + assert np.all(X_fitted_1a == X_fitted_2) + + +def test_iterative_imputer_no_missing(): + rng = np.random.RandomState(0) + X = rng.rand(100, 100) + X[:, 0] = np.nan + m1 = IterativeImputer(max_iter=10, random_state=rng) + m2 = 
IterativeImputer(max_iter=10, random_state=rng) + pred1 = m1.fit(X).transform(X) + pred2 = m2.fit_transform(X) + # should exclude the first column entirely + assert_allclose(X[:, 1:], pred1) + # fit and fit_transform should both be identical + assert_allclose(pred1, pred2) + + +def test_iterative_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 100 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(max_iter=5, + verbose=1, + random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.01) + + +@pytest.mark.parametrize( + "rank", + [3, 5] +) +def test_iterative_imputer_transform_recovery(rank): + rng = np.random.RandomState(0) + n = 100 + d = 100 + A = rng.rand(n, rank) + B = rng.rand(rank, d) + X_filled = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data in half + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = IterativeImputer(max_iter=10, + verbose=1, + random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, atol=0.1) + + +def test_iterative_imputer_additive_matrix(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + A = rng.randn(n, d) + B = rng.randn(n, d) + X_filled = np.zeros(A.shape) + for i in range(d): + for j in range(d): + X_filled[:, (i+j) % d] += (A[:, i] + B[:, j]) / 2 + # a quarter is randomly missing + nan_mask = rng.rand(n, d) < 0.25 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = IterativeImputer(max_iter=10, + verbose=1, + random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01) + + +@pytest.mark.parametrize("max_iter, tol, error_type, warning", [ + (-1, 1e-3, ValueError, 'should be a positive integer'), + (1, -1e-3, ValueError, 'should be a non-negative float') +]) +def test_iterative_imputer_error_param(max_iter, tol, error_type, warning): + X = np.zeros((100, 2)) + imputer = IterativeImputer(max_iter=max_iter, tol=tol) + with pytest.raises(error_type, match=warning): + imputer.fit_transform(X) + + +def test_iterative_imputer_early_stopping(): + rng = np.random.RandomState(0) + n = 50 + d = 5 + A = rng.rand(n, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(max_iter=100, + tol=1e-3, + sample_posterior=False, + verbose=1, + random_state=rng) + X_filled_100 = imputer.fit_transform(X_missing) + assert len(imputer.imputation_sequence_) == d * imputer.n_iter_ + + imputer = IterativeImputer(max_iter=imputer.n_iter_, + sample_posterior=False, + verbose=1, + random_state=rng) + X_filled_early = imputer.fit_transform(X_missing) + assert_allclose(X_filled_100, X_filled_early, atol=1e-7) + + imputer = IterativeImputer(max_iter=100, + tol=0, + sample_posterior=False, + verbose=1, + random_state=rng) + imputer.fit(X_missing) + assert imputer.n_iter_ == imputer.max_iter + + @pytest.mark.parametrize( "X_fit, X_trans, params, msg_err", [(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, -1]]), @@ -650,7 +1062,7 @@ def test_missing_indicator_with_imputer(X, missing_values, 
X_trans_exp): @pytest.mark.parametrize("imputer_constructor", - [SimpleImputer]) + [SimpleImputer, IterativeImputer]) @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index af426f048afd4..6c4196d919aa0 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -72,9 +72,11 @@ 'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression', 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] + ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', - 'PowerTransformer', 'QuantileTransformer'] + 'PowerTransformer', 'QuantileTransformer', 'IterativeImputer'] + SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator']
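
A final note on the multiple-imputation usage described in
``doc/modules/impute.rst`` above: repeatedly applying ``IterativeImputer`` with
``sample_posterior=True`` and different random seeds yields multiple completed
datasets. A minimal sketch of that pattern follows; the aggregation at the end
is illustrative and not part of the estimator API::

    import numpy as np
    from sklearn.impute import IterativeImputer

    X = np.array([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]],
                 dtype=float)

    n_imputations = 5
    completed = []
    for seed in range(n_imputations):
        imp = IterativeImputer(sample_posterior=True, max_iter=10,
                               random_state=seed)
        completed.append(imp.fit_transform(X))

    # the spread across the imputed entries reflects the uncertainty
    # due to the missing values
    print(np.std([Xc[3, 0] for Xc in completed]))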