[MRG+2-1] ENH add a ValueDropper to artificially insert missing values (NMAR or MCAR) to the dataset #7084
Status: Closed
Commits (18, all by raghavrv):

9ec9512  ENH Add ValueDropper to artificially insert missing values of MCAR or
22c716b  Merge branch 'master' into value_dropper
ff6f14c  Flake8
613e491  ENH use drop-probabilites instead of missing fractions
d201a28  DOC edit whatsnew to reflect recent changes
1c4eabf  Modify and fix minor inconsistencies in the example
deabc72  Fix tests and doc
ca3b2e6  Fake8 -_-
7e6c901  Allow floats (w/TST); Simplify doc; Remove redundant checks
f78174a  Use missing mask instead of recomputing it; Add err msg
4c608fd  Cosmit
84a086b  flake8
416b87f  Merge branch 'master' into value_dropper
339a55a  Generate a matrix of random states to avoid shuffling unneeded indice…
84ff832  cosmits
2d03e8e  Attempt unifying MCAR/MNAR missing_probs generation
eddaff5  COSMITS
d6c2623  Fix doctest and flake8
""" | ||
================================================================ | ||
Data Pertubation: Generating NMAR / MCAR missing_values in data | ||
================================================================ | ||
|
||
This example illustrates how the :class:`sklearn.datasets.ValueDropper` can | ||
be used to generate missing values completely at random or based on the | ||
given drop-probabilities. | ||
|
||
The :class`sklearn.datasets.ValueDropper` is a transformer which can be | ||
initialized with a ``missing_proba`` specifying the drop-probabilites | ||
for each class label (and each feature if needed). This facilitates | ||
benchmarking missing-value strategies and evaluating the performance of such | ||
strategies with respect to the type, extent and distribution of missingness in | ||
the data. Importantly, when ``random_state`` is set to an integer, it | ||
provisions preserving the drop-locations as the ``missing_proba`` is increased | ||
to study the effect of the more missing values. This allows benchmarking | ||
with incremental missing rates without causing variation in the results due to | ||
an inconsistency in the drop-locations between different scales of | ||
``missing_proba``. | ||
|
||
NMAR or Not Missing At Random refers to the case when the missingness in the | ||
data is distributed not at random. It is either correlated with the target | ||
value(s) or with the data itself. In some references it is also refered to as | ||
MNAR or Missing Not At Random. | ||
|
||
MCAR or Missing Completely At Random refers to the case when the missingness | ||
in the data is completely random and does not correlate with the classification | ||
target value(s) or the data. | ||
""" | ||
# Author: Raghav RV <[email protected]> | ||
# | ||
# License: BSD 3 clause | ||
|
||
from __future__ import print_function | ||
|
||
import numpy as np | ||
from sklearn.datasets import ValueDropper | ||
|
||
print(__doc__) | ||
|
||
|
||
X = np.random.RandomState(0).random_sample((20, 3)) | ||
y = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2]) | ||
|
||
# For samples from class 1, each feature will be missing 20% of its values | ||
vd = ValueDropper(missing_proba={1: 0.2}, random_state=0) | ||
X_dropped = vd.transform(X, y) | ||
|
||
print("\nAfter dropping 20% of values (per feature) in samples of class 1:") | ||
print("y", "X", sep="\t") | ||
print("------------------------") | ||
for i in range(y.shape[0]): | ||
print(y[i], X_dropped[i], sep="\t") | ||
|
||
# Each feature of samples of class 1 will have a further 20% of its values | ||
# missing. (Old locations will be preserved as random_state is set) | ||
vd = ValueDropper(missing_proba={1: 0.4}, random_state=0) | ||
X_dropped = vd.transform(X, y) | ||
|
||
print("\nAfter dropping another 20% of values (per feature) in samples of " | ||
"class 1:") | ||
print("y", "X", sep="\t") | ||
print("------------------------") | ||
for i in range(y.shape[0]): | ||
print(y[i], X_dropped[i], sep="\t") | ||
|
||
# Drop 30% of values in each feature completely at random | ||
|
||
vd = ValueDropper(missing_proba=0.3, random_state=0) | ||
X_dropped = vd.transform(X, y) | ||
|
||
print("\nAfter dropping 30% of values randomly:") | ||
print("y", "X", sep="\t") | ||
print("------------------------") | ||
for i in range(y.shape[0]): | ||
print(y[i], X_dropped[i], sep="\t") | ||
|
||
# Drop values based on the given drop-probabilities - | ||
|
||
# For samples of class 0, drop 10% of values (in each feature) | ||
# For samples of class 2, drop 20% of values in feature 0, 40% in feature 1 | ||
# and None in feature 2 | ||
# Don't drop any values for samples of class 1. | ||
missing_proba = {0: 0.1, 2: [0.2, 0.4, 0]} | ||
vd = ValueDropper(missing_proba=missing_proba, random_state=0) | ||
X_dropped = vd.transform(X, y) | ||
|
||
print("\nAfter dropping one set of missing values based on the " | ||
"missing_proba=%s" % missing_proba) | ||
print("y", "X", sep="\t") | ||
print("------------------------") | ||
for i in range(y.shape[0]): | ||
print(y[i], X_dropped[i], sep="\t") | ||
|
||
# Drop twice as many missing values as in previous step. | ||
missing_proba = {0: 0.2, 2: [0.4, 0.6, 0]} | ||
vd = ValueDropper(missing_proba=missing_proba, random_state=0) | ||
X_dropped = vd.transform(X, y) | ||
print("\nAfter dropping another set of missing values based on the new " | ||
"missing_proba=%s" % missing_proba) | ||
print("y", "X", sep="\t") | ||
print("------------------------") | ||
for i in range(y.shape[0]): | ||
print(y[i], X_dropped[i], sep="\t") | ||
|
||
# Drop more values and also drop 40% of values from samples of class 1 | ||
# (in each feature) | ||
missing_proba = {0: 0.3, 1: 0.4, 2: [0.6, 0.8, 0]} | ||
vd = ValueDropper(missing_proba=missing_proba, random_state=0) | ||
X_dropped = vd.transform(X, y) | ||
print("\nAfter dropping another set of missing values based on the new " | ||
"missing_proba=%s" % missing_proba) | ||
print("y", "X", sep="\t") | ||
print("------------------------") | ||
for i in range(y.shape[0]): | ||
print(y[i], X_dropped[i], sep="\t") |
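The reproducibility property the example relies on (drop-locations being preserved as ``missing_proba`` is scaled up) can be illustrated with a small standalone sketch. This is not the PR's implementation; ``drop_mcar`` is a hypothetical helper that drops the first ``k`` entries of one fixed per-feature shuffle, which makes the set of dropped locations monotone in the drop-probability:

```python
import numpy as np


def drop_mcar(X, proba, random_state=0):
    """Return a copy of X with roughly `proba` of each feature set to NaN.

    Hypothetical sketch: one fixed shuffle per feature is drawn from a
    freshly seeded RandomState, and the first k indices of that shuffle
    are dropped. A larger `proba` therefore only adds drop-locations on
    top of those chosen for a smaller `proba`.
    """
    X = np.array(X, dtype=float, copy=True)
    rng = np.random.RandomState(random_state)
    n_samples, n_features = X.shape
    for j in range(n_features):
        order = rng.permutation(n_samples)
        k = int(round(proba * n_samples))
        X[order[:k], j] = np.nan
    return X


X = np.random.RandomState(0).random_sample((20, 3))
mask_30 = np.isnan(drop_mcar(X, 0.3))
mask_60 = np.isnan(drop_mcar(X, 0.6))

# Every location dropped at 30% is still dropped at 60%.
assert mask_60[mask_30].all()
```

The same fixed-shuffle idea extends to the per-class case by shuffling only the row indices of each class, which is what the incremental ``missing_proba`` steps in the example above depend on.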
# TODO (from review): add author + license

import numpy as np

from sklearn.datasets import ValueDropper
from sklearn.datasets import make_classification, make_regression
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_raises_regexp
from sklearn.preprocessing import LabelEncoder


def test_value_dropper_mnar_clf():
    # Test drop-probabilities when the missing distribution is given
    # per class label for classification problems
    n_samples, n_features = 1000, 5
    X, y = make_classification(n_samples=n_samples,
                               n_classes=4,
                               n_features=n_features,
                               n_informative=5,
                               n_redundant=0,
                               n_repeated=0,
                               random_state=0)
    le = LabelEncoder().fit(['a', 'z', 'b', 'j'])
    y_str = le.inverse_transform(y)
    y_int = y

    for y in (y_int, y_str):
        classes = np.unique(y)

        # Samples from class 0 will have a drop-probability of 0.1
        vd = ValueDropper(missing_proba={classes[0]: 0.1},
                          missing_values=np.nan, random_state=0)
        X_dropped = vd.transform(X, y)
        missing_mask = np.isnan(X_dropped)

        # Check the drop-probability for samples of class 0
        assert_almost_equal(missing_mask[y == classes[0]].sum() /
                            float(np.sum(y == classes[0]) * n_features), 0.1,
                            decimal=2)

        # ...and that there are no missing values for samples of the
        # other classes
        assert_equal(missing_mask[y != classes[0]].ravel().sum(), 0)

        # Samples from class 0 will have per-feature drop-probabilities as
        # specified by a list, and samples from class 1 will have 50% of
        # values missing in each feature
        missing_proba = {classes[0]: [0.1, 0.2, 0.2, 0, 0], classes[1]: 0.5}
        vd = ValueDropper(missing_proba=missing_proba, missing_values=np.nan,
                          random_state=0)
        X_dropped = vd.transform(X, y)

        missing_mask = np.isnan(X_dropped)
        # Check that there are no missing values when y is neither 0 nor 1
        assert_equal(missing_mask[(y == classes[2])].ravel().sum(), 0)
        assert_equal(missing_mask[(y == classes[3])].ravel().sum(), 0)

        # Check that the drop-probability for samples of class 1 is 0.5
        # across all features
        assert_array_almost_equal(
            missing_mask[y == classes[1]].sum(axis=0) /
            float(np.sum(y == classes[1])), [0.5] * n_features, decimal=2)

        # Check that the drop-probabilities for samples of class 0 are as
        # given by the missing_proba dict
        assert_array_almost_equal(missing_mask[y == classes[0]].sum(axis=0) /
                                  float(np.sum(y == classes[0])),
                                  missing_proba[classes[0]],
                                  decimal=2)

        # Ensure that scaling up the missing_proba retains the previously
        # dropped locations as long as random_state is set.
        # The scaling need not be linear.
        missing_proba = {classes[0]: [0.1, 0.5, 0.5, 0.1, 0], classes[1]: 0.8}
        vd = ValueDropper(missing_proba=missing_proba,
                          missing_values=-100.2, random_state=0)
        X_dropped2 = vd.transform(X, y)
        new_missing_mask = X_dropped2 == -100.2
        assert_true(np.all(new_missing_mask[missing_mask]))


def test_value_dropper_mnar_reg_error():
    X, y = make_regression(n_samples=10, random_state=0)

    assert_raise_message(ValueError,
                         "only for single target which is discrete"
                         " (classification tasks). The given target (y) is of "
                         "type continuous",
                         ValueDropper(missing_proba={0: 0.2}).transform,
                         X, y)


def check_value_dropper_mcar(X, y):
    X_copy = X.copy()
    X_copy2 = X.copy()
    n_samples, n_features = X.shape
    n_values = n_samples * n_features

    # Inplace dropping of values; zero-correlation (MCAR) case.
    # Even-indexed features get a drop-probability of 0.3 and odd-indexed
    # ones get 0.1.
    # (Also check that the inplace operation works as expected.)
    missing_proba = np.array([0.3, 0.1] * 5)
    vd = ValueDropper(missing_proba=missing_proba, copy=False, random_state=0)
    vd.transform(X_copy, y)
    missing_mask = np.isnan(X_copy)

    global_missing_rate = missing_proba.mean()  # 0.2

    # Check the global missing rate
    assert_almost_equal(missing_mask.ravel().sum() / float(n_values),
                        global_missing_rate)

    # Check the rate for all even-indexed features
    even_feature_missing_mask = missing_mask[:, missing_proba == 0.3]
    assert_almost_equal(even_feature_missing_mask.ravel().sum() /
                        float(even_feature_missing_mask.size), 0.3)

    # Check the rate for all odd-indexed features
    odd_feature_missing_mask = missing_mask[:, missing_proba == 0.1]
    assert_almost_equal(odd_feature_missing_mask.ravel().sum() /
                        float(odd_feature_missing_mask.size), 0.1)

    # Now scale the drop-probability up to 0.6 for all features, this time
    # not inplace (copy=True must be the default).
    # Check with inf as the missing-value marker.
    vd = ValueDropper(missing_proba=0.6, missing_values=np.inf, random_state=0)
    X_more_dropped = vd.transform(X_copy2, y)
    new_missing_mask = np.isinf(X_more_dropped)

    # Check the global drop-probability
    assert_almost_equal(new_missing_mask.ravel().sum() / float(n_values), 0.6)
    # Check the drop-probability for an arbitrary feature (index 3)
    assert_almost_equal(new_missing_mask[:, 3].ravel().sum() /
                        float(n_samples), 0.6)

    # Ensure X is not modified
    assert_array_almost_equal(X_copy2, X)

    # Ensure all the missing positions from the previous step also exist
    # when missing_proba is scaled up
    # (important for reproducibility)
    assert_true(np.all(new_missing_mask[missing_mask]))


def test_value_dropper_mcar():
    # Test missing rates for the MCAR case in a classification problem
    n_samples, n_features = 1000, 10
    X, y_int = make_classification(n_samples=n_samples,
                                   n_features=n_features, random_state=0)
    le = LabelEncoder().fit(['a', 'z'])
    y_str = le.inverse_transform(y_int)
    for y in (y_str, y_int):
        check_value_dropper_mcar(X, y)

    # Test missing rates for the MCAR case in a regression problem
    n_samples, n_features = 1000, 10
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           random_state=0)
    check_value_dropper_mcar(X, y)


def test_value_dropper_errors():
    n_samples, n_features = 1000, 10
    X, y = make_classification(n_samples=n_samples,
                               n_classes=4,
                               n_features=n_features,
                               n_informative=5,
                               n_redundant=0,
                               n_repeated=0,
                               random_state=0)

    # Raise a sensible error when any probability is outside the range [0, 1]
    missing_probas = (
        # NMAR cases
        {0: 2., 1: 0.25, 2: 0.25, 3: 0.25}, {0: 2, }, {0: -2, }, {0: 2.0, },
        {0: [0, 0, 0, 0, 0.24, 0, 0, 0, 0, -0.01], },
        # MCAR cases
        [0, 0, 0, 0.2, 0.3, -0.1, 0, 0, 0, 0.5], 2.5, 1.5,
        [0, -1, 0, 0, 0, 0, 0, 0, 0, 0], 2, -2)
    for missing_proba in missing_probas:
        assert_raise_message(ValueError,
                             "should be within the range of [0, 1]",
                             ValueDropper(
                                 missing_proba=missing_proba).transform, X, y)

    wrong_missing_probas_err_pairs = (
        # 1D vector with fewer or more than n_features elements
        ([0.01, ] * 9, "does not conform to the number of features, 10"),
        ([0.01, ] * 11, "does not conform to the number of features, 10"),

        # Dict entries whose per-feature vectors have fewer or more than
        # n_features elements
        ({1: [0.01, ] * 9, },
         "For label, 1, the shape of the per feature drop-probabilities "
         "vector does not conform to the number of features, 10"),

        ({0: [0.01, ] * 11, 1: [0.01, ] * 10},
         "For label, 0, the shape of the per feature drop-probabilities "
         "vector does not conform to the number of features, 10"),

        # Dict having labels not present in the y labels
        ({0: 0.025, 1: [0.0025, ] * 10, 2: 0.025, 3: 0.025, 4: 0.025},
         "y contains new labels: \[4\]"),

        # Incorrect dict value type
        ({0: 'foo', },
         "For label, 0, probability value must be a float or 1D vector \(list,"
         " tuple or np.ndarray\) of shape \(n_features,\) \'foo\' was passed"),

        ("foobar",
         "must be a float or 1D vector \(list, tuple or np.ndarray\)"
         " of shape \(n_features,\) or dict of floats/1D vectors. "
         "'foobar' was passed."))

    for missing_proba, err_msg in wrong_missing_probas_err_pairs:
        assert_raises_regexp(ValueError, err_msg,
                             ValueDropper(missing_proba=missing_proba)
                             .transform, X, y)

    # When missing_proba is a dict but y is not given
    missing_proba = {0: 0.025}
    assert_raise_message(
        ValueError, "The missing_proba is a dict but y is None.",
        ValueDropper(missing_proba=missing_proba).transform, X)
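The per-class (NMAR) behaviour exercised by the tests above can also be sketched in plain NumPy. This is a hypothetical stand-in for ValueDropper, not the PR's code: ``drop_nmar`` accepts a dict mapping a class label to either a float or a per-feature sequence of probabilities, and validates the [0, 1] range the way the error tests expect:

```python
import numpy as np


def drop_nmar(X, y, missing_proba, random_state=0):
    """Hypothetical sketch of per-class value dropping.

    missing_proba maps a class label to a float (the same drop-probability
    for every feature) or a length-n_features sequence of probabilities.
    """
    X = np.array(X, dtype=float, copy=True)
    rng = np.random.RandomState(random_state)
    n_features = X.shape[1]
    for label, proba in missing_proba.items():
        # Broadcast a scalar probability to one value per feature
        probas = np.broadcast_to(np.asarray(proba, dtype=float),
                                 (n_features,))
        if ((probas < 0) | (probas > 1)).any():
            raise ValueError("All drop-probabilities should be within "
                             "the range of [0, 1]")
        idx = np.flatnonzero(np.asarray(y) == label)
        for j, p in enumerate(probas):
            order = rng.permutation(idx)  # shuffle only this class's rows
            k = int(round(p * idx.size))
            X[order[:k], j] = np.nan
    return X


X = np.random.RandomState(0).random_sample((20, 4))
y = np.array([0] * 10 + [1] * 10)

# Class 0: 50% missing per feature; class 1: 20% missing in feature 0 only.
X_dropped = drop_nmar(X, y, {0: 0.5, 1: [0.2, 0, 0, 0]})
assert (np.isnan(X_dropped[y == 0]).sum(axis=0) == 5).all()
assert np.isnan(X_dropped[y == 1, 0]).sum() == 2
assert np.isnan(X_dropped[y == 1, 1:]).sum() == 0
```

Dropping by exact per-class counts (``round(p * idx.size)``) rather than per-value coin flips keeps the realized missing rates close to the requested probabilities, which is why the tests can assert the rates to two decimal places.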
Review comments:

- "Maybe a more generic title, like 'Dataset perturbation', is appropriate?"
- "I meant here, not on the example."
- "+1"