diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 5b44889bfae2f..bd5084450c79b 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -303,6 +303,18 @@ Samples generator
 
    datasets.make_checkerboard
 
+Missing Value Generator
+-----------------------
+
+.. currentmodule:: sklearn
+
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst
+
+   datasets.ValueDropper
+
+
 .. _decomposition_ref:
 
 :mod:`sklearn.decomposition`: Matrix Decomposition
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index cc481740c96f7..087db61294f23 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -35,6 +35,12 @@ New features
      detection based on nearest neighbors.
      :issue:`5279` by `Nicolas Goix`_ and `Alexandre Gramfort`_.
 
+   - Added the :class:`datasets.ValueDropper` transformer to artificially
+     introduce missing values based on per-class or per-feature
+     drop-probabilities (for introducing NMAR missingness) or a global
+     drop-probability (for introducing MCAR missingness).
+     :issue:`7084` by `Raghav RV`_.
+
 Enhancements
 ............
diff --git a/examples/datasets/generate_missing_values.py b/examples/datasets/generate_missing_values.py
new file mode 100644
index 0000000000000..659d4844e24bf
--- /dev/null
+++ b/examples/datasets/generate_missing_values.py
@@ -0,0 +1,117 @@
+"""
+================================================================
+Data Perturbation: Generating NMAR / MCAR missing values in data
+================================================================
+
+This example illustrates how the :class:`sklearn.datasets.ValueDropper` can
+be used to generate missing values completely at random or based on the
+given drop-probabilities.
+
+The :class:`sklearn.datasets.ValueDropper` is a transformer which can be
+initialized with a ``missing_proba`` specifying the drop-probabilities
+for each class label (and each feature, if needed). This facilitates
+benchmarking missing-value strategies and evaluating the performance of such
+strategies with respect to the type, extent and distribution of missingness
+in the data. Importantly, when ``random_state`` is set to an integer, the
+drop-locations are preserved as ``missing_proba`` is increased, which makes
+it possible to study the effect of increasing missingness. This allows
+benchmarking with incremental missing rates without variation in the results
+caused by inconsistent drop-locations between different scales of
+``missing_proba``.
+
+NMAR or Not Missing At Random refers to the case when the missingness in
+the data is not distributed at random. It is either correlated with the
+target value(s) or with the data itself. In some references it is also
+referred to as MNAR or Missing Not At Random.
+
+MCAR or Missing Completely At Random refers to the case when the missingness
+in the data is completely random and does not correlate with the
+classification target value(s) or the data.
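+
+For example, a sensor that systematically fails to record values above some
+threshold produces NMAR missingness, whereas values lost through random
+transmission errors are MCAR.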
+"""
+# Author: Raghav RV
+#
+# License: BSD 3 clause

+from __future__ import print_function
+
+import numpy as np
+from sklearn.datasets import ValueDropper
+
+print(__doc__)
+
+
+X = np.random.RandomState(0).random_sample((20, 3))
+y = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2])
+
+# For samples from class 1, each feature will be missing 20% of its values
+vd = ValueDropper(missing_proba={1: 0.2}, random_state=0)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping 20% of values (per feature) in samples of class 1:")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Each feature of the samples of class 1 will now have a further 20% of its
+# values missing. (The old drop-locations are preserved, as random_state is
+# set.)
+vd = ValueDropper(missing_proba={1: 0.4}, random_state=0)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping another 20% of values (per feature) in samples of "
+      "class 1:")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Drop 30% of values in each feature completely at random
+vd = ValueDropper(missing_proba=0.3, random_state=0)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping 30% of values completely at random:")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Drop values based on the given per-class drop-probabilities:
+# for samples of class 0, drop 10% of values (in each feature);
+# for samples of class 2, drop 20% of values in feature 0, 40% in feature 1
+# and none in feature 2;
+# don't drop any values for samples of class 1.
+missing_proba = {0: 0.1, 2: [0.2, 0.4, 0]}
+vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping one set of missing values based on "
+      "missing_proba=%s" % missing_proba)
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
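+
+# Illustrative sanity check (not essential to the example): with a fixed
+# random_state the transform is deterministic, so repeating the same drop
+# reproduces an identical missing mask.
+vd_repeat = ValueDropper(missing_proba=missing_proba, random_state=0)
+assert np.array_equal(np.isnan(vd_repeat.transform(X, y)),
+                      np.isnan(X_dropped))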
+
+# Drop twice as many missing values as in the previous step.
+missing_proba = {0: 0.2, 2: [0.4, 0.6, 0]}
+vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+X_dropped = vd.transform(X, y)
+print("\nAfter dropping another set of missing values based on the new "
+      "missing_proba=%s" % missing_proba)
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Drop more values and also drop 40% of values from samples of class 1
+# (in each feature)
+missing_proba = {0: 0.3, 1: 0.4, 2: [0.6, 0.8, 0]}
+vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+X_dropped = vd.transform(X, y)
+print("\nAfter dropping another set of missing values based on the new "
+      "missing_proba=%s" % missing_proba)
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py
index 0a8cfc62df537..e4a4f677b6172 100644
--- a/sklearn/datasets/__init__.py
+++ b/sklearn/datasets/__init__.py
@@ -52,6 +52,7 @@
 from .species_distributions import fetch_species_distributions
 from .california_housing import fetch_california_housing
 from .rcv1 import fetch_rcv1
+from .value_dropper import ValueDropper
 
 
 __all__ = ['clear_data_home',
@@ -102,4 +103,5 @@
            'make_sparse_uncorrelated',
            'make_spd_matrix',
            'make_swiss_roll',
-           'mldata_filename']
+           'mldata_filename',
+           'ValueDropper']
diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py
new file mode 100644
index 0000000000000..2b5f231f16306
--- /dev/null
+++ b/sklearn/datasets/tests/test_value_dropper.py
@@ -0,0 +1,232 @@
+import numpy as np
+
+from sklearn.datasets import ValueDropper
+from sklearn.datasets import make_classification, make_regression
+from sklearn.utils.testing import assert_equal
+from sklearn.utils.testing import assert_true
+from sklearn.utils.testing import assert_almost_equal
+from sklearn.utils.testing import assert_array_almost_equal
+from sklearn.utils.testing import assert_raise_message
+from sklearn.utils.testing import assert_raises_regexp
+from sklearn.preprocessing import LabelEncoder
+
+
+def test_value_dropper_mnar_clf():
+    # Test drop-probabilities when the missing distribution is
+    # given for classification problems
+    n_samples, n_features = 1000, 5
+    X, y = make_classification(n_samples=n_samples,
+                               n_classes=4,
+                               n_features=n_features,
+                               n_informative=5,
+                               n_redundant=0,
+                               n_repeated=0,
+                               random_state=0)
+    le = LabelEncoder().fit(['a', 'z', 'b', 'j'])
+    y_str = le.inverse_transform(y)
+    y_int = y
+
+    for y in (y_int, y_str):
+        classes = np.unique(y)
+
+        # Samples from class 0 will have a drop-probability of 0.1
+        vd = ValueDropper(missing_proba={classes[0]: 0.1},
+                          missing_values=np.nan, random_state=0)
+        X_dropped = vd.transform(X, y)
+        missing_mask = np.isnan(X_dropped)
+
+        # Check the drop-probability for samples of class 0
+        assert_almost_equal(missing_mask[y == classes[0]].ravel().sum() /
+                            float(np.sum(y == classes[0]) * n_features), 0.1,
+                            decimal=2)
+
+        # and that there are no missing values for samples where y != 0
+        assert_equal(missing_mask[y != classes[0]].ravel().sum(), 0)
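+
+        # Illustrative extra check: the overall missing rate is the class-0
+        # drop-probability scaled by the fraction of class-0 samples.
+        assert_almost_equal(missing_mask.sum() /
+                            float(n_samples * n_features),
+                            0.1 * np.sum(y == classes[0]) / float(n_samples),
+                            decimal=2)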
+
+        # Samples from class 1 will have 50% of their values missing in each
+        # feature, and samples from class 0 will have drop-probabilities as
+        # specified by a list of per-feature drop-probabilities
+        missing_proba = {classes[0]: [0.1, 0.2, 0.2, 0, 0], classes[1]: 0.5}
+        vd = ValueDropper(missing_proba=missing_proba, missing_values=np.nan,
+                          random_state=0)
+        X_dropped = vd.transform(X, y)
+
+        missing_mask = np.isnan(X_dropped)
+        # Check that there are no missing values when y != {0 or 1}
+        assert_equal(missing_mask[(y == classes[2])].ravel().sum(), 0)
+        assert_equal(missing_mask[(y == classes[3])].ravel().sum(), 0)
+
+        # Check that the drop-probabilities for samples of class 1 are 0.5
+        # across all features
+        assert_array_almost_equal(
+            missing_mask[y == classes[1]].sum(axis=0) /
+            float(np.sum(y == classes[1])), [0.5] * n_features, decimal=2)
+
+        # Check that the drop-probabilities for samples of class 0 are as
+        # given by the missing_proba dict
+        assert_array_almost_equal(missing_mask[y == classes[0]].sum(axis=0) /
+                                  float(np.sum(y == classes[0])),
+                                  missing_proba[classes[0]],
+                                  decimal=2)
+
+        # Ensure that scaling up the missing_proba retains the previously
+        # dropped locations, as long as random_state is set.
+        # (The upscaling need not be linear.)
+        missing_proba = {classes[0]: [0.1, 0.5, 0.5, 0.1, 0], classes[1]: 0.8}
+        vd = ValueDropper(missing_proba=missing_proba,
+                          missing_values=-100.2, random_state=0)
+        X_dropped2 = vd.transform(X, y)
+        new_missing_mask = X_dropped2 == -100.2
+        assert_true(np.all(new_missing_mask[missing_mask]))
+
+
+def test_value_dropper_mnar_reg_error():
+    X, y = make_regression(n_samples=10, random_state=0)
+
+    assert_raise_message(ValueError,
+                         "only for single target which is discrete"
+                         " (classification tasks). The given target (y) is "
+                         "of type continuous",
+                         ValueDropper(missing_proba={0: 0.2}).transform,
+                         X, y)
+
+
+def check_value_dropper_mcar(X, y):
+    X_copy = X.copy()
+    X_copy2 = X.copy()
+    n_samples, n_features = X.shape
+    n_values = n_samples * n_features
+
+    # Inplace dropping of values; zero-correlation (MCAR) case.
+    # For even indexed features the drop-probability is 0.3 and
+    # for odd indexed ones, 0.1.
+    # (Also check that the inplace operation works as expected.)
+    missing_proba = np.array([0.3, 0.1] * 5)
+    vd = ValueDropper(missing_proba=missing_proba, copy=False, random_state=0)
+    vd.transform(X_copy, y)
+    missing_mask = np.isnan(X_copy)
+
+    global_missing_rate = missing_proba.mean()  # 0.2
+
+    # Check the global missing rate
+    assert_almost_equal(missing_mask.ravel().sum() / float(n_values),
+                        global_missing_rate)
+
+    # Check the rate for all even indexed features
+    even_feature_missing_mask = missing_mask[:, missing_proba == 0.3]
+    assert_almost_equal(even_feature_missing_mask.ravel().sum() /
+                        float(even_feature_missing_mask.size), 0.3)
+
+    # Check the rate for all odd indexed features
+    odd_feature_missing_mask = missing_mask[:, missing_proba == 0.1]
+    assert_almost_equal(odd_feature_missing_mask.ravel().sum() /
+                        float(odd_feature_missing_mask.size), 0.1)
+
+    # Now drop a larger fraction (0.6) of values in every feature; this time
+    # not inplace (copy=True is the default).
+    # Check with inf as the missing value marker.
+    vd = ValueDropper(missing_proba=0.6, missing_values=np.inf,
+                      random_state=0)
+    X_more_dropped = vd.transform(X_copy2, y)
+    new_missing_mask = np.isinf(X_more_dropped)
+
+    # Check the global drop-probability
+    assert_almost_equal(new_missing_mask.ravel().sum() / float(n_values), 0.6)
+    # Check the drop-probability for an arbitrarily chosen feature (3)
+    assert_almost_equal(new_missing_mask[:, 3].ravel().sum() /
+                        float(n_samples), 0.6)
+
+    # Ensure X is not modified
+    assert_array_almost_equal(X_copy2, X)
+
+    # Ensure that all the missing positions from the previous step also
+    # exist when missing_proba is scaled up
+    # (important for reproducibility)
+    assert_true(np.all(new_missing_mask[missing_mask]))
+
+
+def test_value_dropper_mcar():
+    # Test missing fractions for the MCAR case in a classification problem
+    n_samples, n_features = 1000, 10
+    X, y_int = make_classification(n_samples=n_samples,
+                                   n_features=n_features, random_state=0)
+    le = LabelEncoder().fit(['a', 'z'])
+    y_str = le.inverse_transform(y_int)
+    for y in (y_str, y_int):
+        check_value_dropper_mcar(X, y)
+
+    # Test missing fractions for the MCAR case in a regression problem
+    n_samples, n_features = 1000, 10
+    X, y = make_regression(n_samples=n_samples, n_features=n_features,
+                           random_state=0)
+    check_value_dropper_mcar(X, y)
+
+
+def test_value_dropper_errors():
+    n_samples, n_features = 1000, 10
+    X, y = make_classification(n_samples=n_samples,
+                               n_classes=4,
+                               n_features=n_features,
+                               n_informative=5,
+                               n_redundant=0,
+                               n_repeated=0,
+                               random_state=0)
+
+    # Raise a sensible error when any probability is outside the range [0, 1]
+    missing_probas = (
+        # NMAR cases
+        {0: 2., 1: 0.25, 2: 0.25, 3: 0.25}, {0: 2, }, {0: -2, }, {0: 2.0, },
+        {0: [0, 0, 0, 0, 0.24, 0, 0, 0, 0, -0.01], },
+        # MCAR cases
+        [0, 0, 0, 0.2, 0.3, -0.1, 0, 0, 0, 0.5], 2.5, 1.5,
+        [0, -1, 0, 0, 0, 0, 0, 0, 0, 0], 2, -2)
+    for missing_proba in missing_probas:
+        assert_raise_message(ValueError,
+                             "should be within the range of [0, 1]",
+                             ValueDropper(
+                                 missing_proba=missing_proba).transform, X, y)
+
+    # Each pair below is an invalid missing_proba and a fragment of the
+    # error message expected from transform.
+    wrong_missing_probas_err_pairs = (
+        # 1D vector with fewer or more than n_features elements
+        ([0.01, ] * 9, "does not conform to the number of features, 10"),
+        ([0.01, ] * 11, "does not conform to the number of features, 10"),
+
+        # Dict with labels having fewer or more than n_features elements
+        ({1: [0.01, ] * 9, },
+         "For label, 1, the shape of the per feature drop-probabilities "
+         "vector does not conform to the number of features, 10"),
+
+        ({0: [0.01, ] * 11, 1: [0.01, ] * 10},
+         "For label, 0, the shape of the per feature drop-probabilities "
+         "vector does not conform to the number of features, 10"),
+
+        # Dict having labels not present in the labels of y
+        ({0: 0.025, 1: [0.0025, ] * 10, 2: 0.025, 3: 0.025, 4: 0.025},
+         "y contains new labels: \[4\]"),
+
+        # Incorrect dict or incorrect value
+        ({0: 'foo', },
+         "For label, 0, probability value must be a float or 1D vector "
+         "\(list, tuple or np.ndarray\) of shape \(n_features,\) "
+         "'foo' was passed"),
+
+        ("foobar",
+         "must be a float or 1D vector \(list, tuple or np.ndarray\)"
+         " of shape \(n_features,\) or dict of floats/1D vectors. "
+         "'foobar' was passed."))
+
+    for missing_proba, err_msg in wrong_missing_probas_err_pairs:
+        assert_raises_regexp(ValueError, err_msg,
+                             ValueDropper(missing_proba=missing_proba)
+                             .transform, X, y)
+
+    # When missing_proba is a dict, but y is not given
+    missing_proba = {0: 0.025}
+    assert_raise_message(
+        ValueError, "The missing_proba is a dict but y is None.",
+        ValueDropper(missing_proba=missing_proba).transform, X)
diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py
new file mode 100644
index 0000000000000..29ca635d131ad
--- /dev/null
+++ b/sklearn/datasets/value_dropper.py
@@ -0,0 +1,295 @@
+# Author : Raghav RV
+#
+# License : BSD 3 clause
+
+import numbers
+
+import numpy as np
+
+from sklearn.utils import check_array
+from sklearn.utils import check_random_state
+from sklearn.utils.multiclass import type_of_target
+
+from sklearn.base import TransformerMixin
+from sklearn.preprocessing import LabelEncoder
+
+
+__all__ = ["ValueDropper"]
+
+
+class ValueDropper(TransformerMixin):
+    """Artificially insert NMAR or MCAR missing values into data.
+
+    Where,
+
+    NMAR/MNAR - Not Missing At Random / Missing Not At Random
+        When the missingness is correlated with the class labels in the
+        target (y) (and hence informative).
+
+    MCAR - Missing Completely At Random
+        When the missingness is completely random (and hence uninformative).
+
+    For NMAR missingness, ``missing_proba`` can be given as a dict to drop
+    values conforming to the given per-class drop-probabilities.
+
+    Parameters
+    ----------
+
+    missing_values : {"NaN" (or np.nan) | int | float}, default "NaN"
+        The value to insert to indicate missingness.
+
+    missing_proba : float, 1D array-like of floats, or dict
+        If a single float is given, it specifies the global drop-probability
+        that is applied to every feature (MCAR missingness).
+
+        To vary the proportion of values dropped across each feature,
+        individual drop-probabilities for each feature can be specified as a
+        1D array-like of shape (n_features,) (e.g. [0.1, 0.15, 0.1]).
+
+        If missingness is NMAR, a dict of floats can be used to specify
+        the drop-probabilities on a per-label basis
+        (e.g. {1: 0.2, 2: 0.3, 3: 0.5}).
+
+        This dict can also contain 1D array-likes of shape (n_features,)
+        to vary the drop-probabilities across features
+        (e.g. {1: 0.1, 3: [0.1, 0.15, 0.1]}).
+
+    copy : bool, default True
+        Whether to copy the data or work inplace.
+
+    random_state : int, optional
+        The seed for numpy's random number generator.
+
+        If ``random_state`` is set to an integer, the ``missing_proba``
+        can be upscaled safely with the assumption that all the values
+        dropped with the smaller scale will also be dropped with the larger
+        scaled version::
+
+            missing_proba_1 = {0: 0.1, 3: [0.3, 0.1, 0.1]}
+            missing_proba_2 = {0: 0.1, 1: 0.2, 3: [0.6, 0.1, 0.8]}
+
+        The missing values dropped with ``missing_proba_1`` will also
+        be dropped with ``missing_proba_2``.
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> X = np.array([[0., 1., 2.],
+    ...               [3., 4., 5.],
+    ...               [6., 7., 8.],
+    ...               [9., 0., 1.],
+    ...               [2., 3., 4.],
+    ...               [8., 9., 8.],
+    ...               [1., 0., 5.],
+    ...               [7., 8., 9.],
+    ...               [5., 4., 3.],
+    ...               [2., 1., 1.],
+    ...               [1., 2., 3.]])
+    >>> y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+    >>> # NMAR missingness -
+    >>> # Drop values from samples of class 1 alone, based on the below
+    >>> # missing_proba, hence making it Not Missing At Random missingness.
+    >>> missing_proba = {1: [0.2,  # Drop 20% of the values of feature 0
+    ...                      0.2,  # and 20% of feature 1, for class 1
+    ...                      0]}   # Do not drop any values from feature 2
+    >>> vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+    >>> vd.transform(X, y)
+    array([[  0.,   1.,   2.],
+           [  3.,   4.,   5.],
+           [  6.,   7.,   8.],
+           [  9.,   0.,   1.],
+           [  2.,   3.,   4.],
+           [  8.,  nan,   8.],
+           [  1.,   0.,   5.],
+           [ nan,   8.,   9.],
+           [  5.,   4.,   3.],
+           [  2.,   1.,   1.],
+           [  1.,   2.,   3.]])
+    >>> # Increase the missing_proba to add more missing values in feature 0.
+    >>> # Also add a few missing values in all features for class 0 samples.
+    >>> missing_proba = {1: [0.4, 0.2, 0], 0: 0.6}
+    >>> vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+    >>> vd.transform(X, y)
+    array([[ nan,  nan,  nan],
+           [ nan,   4.,  nan],
+           [ nan,  nan,   8.],
+           [  9.,   0.,   1.],
+           [  2.,  nan,  nan],
+           [  8.,  nan,   8.],
+           [ nan,   0.,   5.],
+           [ nan,   8.,   9.],
+           [  5.,   4.,   3.],
+           [  2.,   1.,   1.],
+           [  1.,   2.,   3.]])
+    >>> # MCAR missingness -
+    >>> # Drop 30% of values in each feature Missing Completely At Random
+    >>> vd = ValueDropper(missing_proba=0.3, random_state=0)
+    >>> vd.transform(X, y)
+    array([[  0.,   1.,  nan],
+           [  3.,   4.,   5.],
+           [  6.,  nan,   8.],
+           [  9.,   0.,   1.],
+           [  2.,  nan,   4.],
+           [ nan,   9.,   8.],
+           [ nan,  nan,   5.],
+           [ nan,   8.,  nan],
+           [  5.,   4.,  nan],
+           [  2.,   1.,   1.],
+           [  1.,   2.,   3.]])
+    >>> # Increase the missing_proba to add more missing values in features
+    >>> # 0 and 1 alone. Retain the same drop-probability for feature 2.
+    >>> # Explicitly set copy=False for inplace dropping of values.
+    >>> vd = ValueDropper(missing_proba=[0.6, 0.8, 0.3],
+    ...                   copy=False, random_state=0)
+    >>> _ = vd.transform(X, y)
+    >>> X
+    array([[  0.,   1.,  nan],
+           [ nan,  nan,   5.],
+           [  6.,  nan,   8.],
+           [  9.,   0.,   1.],
+           [ nan,  nan,   4.],
+           [ nan,  nan,   8.],
+           [ nan,  nan,   5.],
+           [ nan,  nan,  nan],
+           [  5.,  nan,  nan],
+           [ nan,  nan,   1.],
+           [ nan,  nan,   3.]])
+    """
+
+    def __init__(self, missing_values="NaN",
+                 missing_proba=None, copy=True, random_state=None):
+        self.missing_values = missing_values
+        self.missing_proba = missing_proba
+        self.copy = copy
+        self.random_state = random_state
+
+    def transform(self, X, y=None):
+        """Drop values from ``X`` according to the given distribution.
+
+        Parameters
+        ----------
+
+        X : array-like of shape (n_samples, n_features)
+            Data, in which the values must be dropped and set to
+            ``missing_values``.
+
+        y : array-like of shape (n_samples,), optional for MCAR
+            Target relative to X for classification or regression.
+            When missing_proba is not a dict (MCAR missingness),
+            ``y`` need not be passed.
+        """
+        # Resolve the value that will be used to mark missingness
+        if ((isinstance(self.missing_values, str) and
+                (self.missing_values.lower() == "nan")) or
+                np.isnan(self.missing_values)):
+            missing_values = np.nan
+        else:
+            missing_values = self.missing_values
+
+        # Don't allow pre-existing missing values in X, to simplify the API
+        X = check_array(X, dtype=('numeric'
+                                  if isinstance(missing_values,
+                                                (numbers.Integral,
+                                                 np.integer))
+                                  else np.float64),
+                        copy=self.copy)
+
+        n_samples, n_features = X.shape
+        rng = check_random_state(self.random_state)
+
+        # Validate y, and find the type of missingness
+        if isinstance(self.missing_proba, dict):
+            # For NMAR:
+            # validate and convert the missing_proba dict into a
+            # 2D probability distribution along the features and labels
+            missing_type = 'nmar'
+
+            if y is None:
+                raise ValueError("The missing_proba is a dict but y is "
+                                 "None. If missingness is to be related to "
+                                 "the class labels, target class labels (y) "
+                                 "must be passed.")
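+
+            # For NMAR, y is needed so that the samples can be grouped by
+            # class label before the values are dropped.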
+            target_type = type_of_target(y)
+            if 'continuous' in target_type or 'multioutput' in target_type:
+                raise ValueError("Value dropping based on the given "
+                                 "distribution can be done only for single "
+                                 "target which is discrete (classification "
+                                 "tasks). The given target (y) is of type %s"
+                                 % target_type)
+            y = check_array(y, ensure_2d=False, dtype='numeric')
+
+            le = LabelEncoder().fit(y)
+            classes = le.classes_
+            n_classes = classes.shape[0]
+
+            drop_probs = np.zeros((n_classes, n_features), dtype=np.float64)
+
+            class_keys, probas = zip(*self.missing_proba.items())
+            encoded_class_keys = le.transform(class_keys)
+        else:
+            # For MCAR:
+            # validate and convert the missing_proba into a
+            # 1D probability distribution along the features
+            missing_type = 'mcar'
+
+            drop_probs = np.zeros((1, n_features), dtype=np.float64)
+
+            # Hack to simplify and unify the missing value generation code
+            # for the nmar and mcar cases
+            classes = class_keys = encoded_class_keys = (0, )
+            probas = (self.missing_proba, )
+            y = np.zeros(n_samples)
+
+        # For both nmar and mcar
+        for encoded_class_key, class_key, proba in zip(encoded_class_keys,
+                                                       class_keys, probas):
+            if isinstance(proba, (np.ndarray, list, tuple)):
+                proba = np.asarray(proba)
+                if proba.shape[0] != n_features:
+                    raise ValueError("%s shape of the per feature "
+                                     "drop-probabilities vector "
+                                     "does not conform to the number of "
+                                     "features, %d"
+                                     % ("For label, %s, the" % class_key
+                                        if missing_type == 'nmar'
+                                        else "The", n_features))
+            elif not isinstance(proba, (np.floating, float,
+                                        numbers.Integral, np.integer)):
+                raise ValueError("%s value must be a float or "
+                                 "1D vector (list, tuple or np.ndarray) of "
+                                 "shape (n_features,)%s %r was passed."
+                                 % ("For label, %s, probability" % class_key
+                                    if missing_type == 'nmar'
+                                    else 'Probability',
+                                    " or dict of floats/1D vectors."
+                                    if missing_type == 'mcar' else "", proba))
+
+            drop_probs[encoded_class_key, :] = proba
+
+        if np.any(drop_probs < 0) or np.any(drop_probs > 1):
+            raise ValueError("All the individual drop-probabilities should "
+                             "be within the range of [0, 1]. The given "
+                             "missing_proba does not conform to that. %r"
+                             % self.missing_proba)
+
+        # Generate a random_state for each feature / label combination in
+        # advance. This is important to keep the generated missing values
+        # consistent for successively increasing values of missing_proba.
+        random_states = rng.randint(0, np.iinfo(np.int32).max,
+                                    drop_probs.shape)
+
+        for i, class_i in enumerate(classes):
+            samples_mask = (y == class_i)
+            this_n_samples = samples_mask.sum()
+            this_block_indices = np.arange(n_samples)[samples_mask]
+
+            for feature in range(n_features):
+                this_required_n_missing = int(round(drop_probs[i, feature] *
+                                                    this_n_samples))
+                if this_required_n_missing == 0:
+                    continue
+
+                this_rng = check_random_state(random_states[i, feature])
+                shuffled_indices = this_rng.permutation(this_block_indices)
+
+                # Drop the first this_required_n_missing shuffled positions
+                X[shuffled_indices[:this_required_n_missing],
+                  feature] = missing_values
+
+        return X