diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 5b44889bfae2f..bd5084450c79b 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -303,6 +303,18 @@ Samples generator
 
    datasets.make_checkerboard
 
+Missing Value Generator
+-----------------------
+
+.. currentmodule:: sklearn
+
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst
+
+   datasets.ValueDropper
+
+
 .. _decomposition_ref:
 
 :mod:`sklearn.decomposition`: Matrix Decomposition
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index cc481740c96f7..087db61294f23 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -35,6 +35,12 @@ New features
      detection based on nearest neighbors.
      :issue:`5279` by `Nicolas Goix`_ and `Alexandre Gramfort`_.
 
+   - Added the :class:`datasets.ValueDropper` transformer to artificially
+     introduce missing values based on per-class or per-feature
+     drop-probabilities (for introducing NMAR missingness) or a global
+     drop-probability (for introducing MCAR missingness).
+     :issue:`7084` by `Raghav RV`_.
+
 Enhancements
 ............
diff --git a/examples/datasets/generate_missing_values.py b/examples/datasets/generate_missing_values.py
new file mode 100644
index 0000000000000..659d4844e24bf
--- /dev/null
+++ b/examples/datasets/generate_missing_values.py
@@ -0,0 +1,117 @@
+"""
+================================================================
+Data Perturbation: Generating NMAR / MCAR missing values in data
+================================================================
+
+This example illustrates how the :class:`sklearn.datasets.ValueDropper` can
+be used to generate missing values completely at random or based on the
+given drop-probabilities.
+
+The :class:`sklearn.datasets.ValueDropper` is a transformer which can be
+initialized with a ``missing_proba`` specifying the drop-probabilities
+for each class label (and each feature, if needed). This facilitates
+benchmarking missing-value strategies and evaluating the performance of such
+strategies with respect to the type, extent and distribution of missingness
+in the data. Importantly, when ``random_state`` is set to an integer, the
+drop-locations are preserved as ``missing_proba`` is increased, which makes
+it possible to study the effect of increasing missingness. This allows
+benchmarking with incremental missing rates without variation in the results
+caused by inconsistent drop-locations between different scales of
+``missing_proba``.
+
+NMAR or Not Missing At Random refers to the case when the missingness in
+the data is not distributed at random. It is either correlated with the
+target value(s) or with the data itself. In some references it is also
+referred to as MNAR or Missing Not At Random.
+
+MCAR or Missing Completely At Random refers to the case when the missingness
+in the data is completely random and does not correlate with the
+classification target value(s) or the data.
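+
+For example, a sensor that systematically fails to record values above some
+threshold produces NMAR missingness, whereas values lost through random
+transmission errors are MCAR.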
+"""
+# Author: Raghav RV
+#
+# License: BSD 3 clause

+from __future__ import print_function
+
+import numpy as np
+from sklearn.datasets import ValueDropper
+
+print(__doc__)
+
+
+X = np.random.RandomState(0).random_sample((20, 3))
+y = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2])
+
+# For samples from class 1, each feature will be missing 20% of its values
+vd = ValueDropper(missing_proba={1: 0.2}, random_state=0)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping 20% of values (per feature) in samples of class 1:")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Each feature of the samples of class 1 will now have a further 20% of its
+# values missing. (The old drop-locations are preserved, as random_state is
+# set.)
+vd = ValueDropper(missing_proba={1: 0.4}, random_state=0)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping another 20% of values (per feature) in samples of "
+      "class 1:")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Drop 30% of values in each feature completely at random
+vd = ValueDropper(missing_proba=0.3, random_state=0)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping 30% of values completely at random:")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Drop values based on the given per-class drop-probabilities:
+# for samples of class 0, drop 10% of values (in each feature);
+# for samples of class 2, drop 20% of values in feature 0, 40% in feature 1
+# and none in feature 2;
+# don't drop any values for samples of class 1.
+missing_proba = {0: 0.1, 2: [0.2, 0.4, 0]}
+vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping one set of missing values based on "
+      "missing_proba=%s" % missing_proba)
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
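+
+# Illustrative sanity check (not essential to the example): with a fixed
+# random_state the transform is deterministic, so repeating the same drop
+# reproduces an identical missing mask.
+vd_repeat = ValueDropper(missing_proba=missing_proba, random_state=0)
+assert np.array_equal(np.isnan(vd_repeat.transform(X, y)),
+                      np.isnan(X_dropped))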
+
+# Drop twice as many missing values as in the previous step.
+missing_proba = {0: 0.2, 2: [0.4, 0.6, 0]}
+vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+X_dropped = vd.transform(X, y)
+print("\nAfter dropping another set of missing values based on the new "
+      "missing_proba=%s" % missing_proba)
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Drop more values and also drop 40% of values from samples of class 1
+# (in each feature)
+missing_proba = {0: 0.3, 1: 0.4, 2: [0.6, 0.8, 0]}
+vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+X_dropped = vd.transform(X, y)
+print("\nAfter dropping another set of missing values based on the new "
+      "missing_proba=%s" % missing_proba)
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py
index 0a8cfc62df537..e4a4f677b6172 100644
--- a/sklearn/datasets/__init__.py
+++ b/sklearn/datasets/__init__.py
@@ -52,6 +52,7 @@
 from .species_distributions import fetch_species_distributions
 from .california_housing import fetch_california_housing
 from .rcv1 import fetch_rcv1
+from .value_dropper import ValueDropper
 
 
 __all__ = ['clear_data_home',
@@ -102,4 +103,5 @@
            'make_sparse_uncorrelated',
            'make_spd_matrix',
            'make_swiss_roll',
-           'mldata_filename']
+           'mldata_filename',
+           'ValueDropper']
diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py
new file mode 100644
index 0000000000000..2b5f231f16306
--- /dev/null
+++ b/sklearn/datasets/tests/test_value_dropper.py
@@ -0,0 +1,232 @@
+import numpy as np
+
+from sklearn.datasets import ValueDropper
+from sklearn.datasets import make_classification, make_regression
+from sklearn.utils.testing import assert_equal
+from sklearn.utils.testing import assert_true
+from sklearn.utils.testing import assert_almost_equal
+from sklearn.utils.testing import assert_array_almost_equal
+from sklearn.utils.testing import assert_raise_message
+from sklearn.utils.testing import assert_raises_regexp
+from sklearn.preprocessing import LabelEncoder
+
+
+def test_value_dropper_mnar_clf():
+    # Test drop-probabilities when the missing distribution is
+    # given for classification problems
+    n_samples, n_features = 1000, 5
+    X, y = make_classification(n_samples=n_samples,
+                               n_classes=4,
+                               n_features=n_features,
+                               n_informative=5,
+                               n_redundant=0,
+                               n_repeated=0,
+                               random_state=0)
+    le = LabelEncoder().fit(['a', 'z', 'b', 'j'])
+    y_str = le.inverse_transform(y)
+    y_int = y
+
+    for y in (y_int, y_str):
+        classes = np.unique(y)
+
+        # Samples from class 0 will have a drop-probability of 0.1
+        vd = ValueDropper(missing_proba={classes[0]: 0.1},
+                          missing_values=np.nan, random_state=0)
+        X_dropped = vd.transform(X, y)
+        missing_mask = np.isnan(X_dropped)
+
+        # Check the drop-probability for samples of class 0
+        assert_almost_equal(missing_mask[y == classes[0]].ravel().sum() /
+                            float(np.sum(y == classes[0]) * n_features), 0.1,
+                            decimal=2)
+
+        # and that there are no missing values for samples where y != 0
+        assert_equal(missing_mask[y != classes[0]].ravel().sum(), 0)
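+
+        # Illustrative extra check: the overall missing rate is the class-0
+        # drop-probability scaled by the fraction of class-0 samples.
+        assert_almost_equal(missing_mask.sum() /
+                            float(n_samples * n_features),
+                            0.1 * np.sum(y == classes[0]) / float(n_samples),
+                            decimal=2)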
+
+        # Samples from class 1 will have 50% of their values missing in each
+        # feature, and samples from class 0 will have drop-probabilities as
+        # specified by a list of per-feature drop-probabilities
+        missing_proba = {classes[0]: [0.1, 0.2, 0.2, 0, 0], classes[1]: 0.5}
+        vd = ValueDropper(missing_proba=missing_proba, missing_values=np.nan,
+                          random_state=0)
+        X_dropped = vd.transform(X, y)
+
+        missing_mask = np.isnan(X_dropped)
+        # Check that there are no missing values when y != {0 or 1}
+        assert_equal(missing_mask[(y == classes[2])].ravel().sum(), 0)
+        assert_equal(missing_mask[(y == classes[3])].ravel().sum(), 0)
+
+        # Check that the drop-probabilities for samples of class 1 are 0.5
+        # across all features
+        assert_array_almost_equal(
+            missing_mask[y == classes[1]].sum(axis=0) /
+            float(np.sum(y == classes[1])), [0.5] * n_features, decimal=2)
+
+        # Check that the drop-probabilities for samples of class 0 are as
+        # given by the missing_proba dict
+        assert_array_almost_equal(missing_mask[y == classes[0]].sum(axis=0) /
+                                  float(np.sum(y == classes[0])),
+                                  missing_proba[classes[0]],
+                                  decimal=2)
+
+        # Ensure that scaling up the missing_proba retains the previously
+        # dropped locations, as long as random_state is set.
+        # (The upscaling need not be linear.)
+        missing_proba = {classes[0]: [0.1, 0.5, 0.5, 0.1, 0], classes[1]: 0.8}
+        vd = ValueDropper(missing_proba=missing_proba,
+                          missing_values=-100.2, random_state=0)
+        X_dropped2 = vd.transform(X, y)
+        new_missing_mask = X_dropped2 == -100.2
+        assert_true(np.all(new_missing_mask[missing_mask]))
+
+
+def test_value_dropper_mnar_reg_error():
+    X, y = make_regression(n_samples=10, random_state=0)
+
+    assert_raise_message(ValueError,
+                         "only for single target which is discrete"
+                         " (classification tasks). The given target (y) is "
+                         "of type continuous",
+                         ValueDropper(missing_proba={0: 0.2}).transform,
+                         X, y)
+
+
+def check_value_dropper_mcar(X, y):
+    X_copy = X.copy()
+    X_copy2 = X.copy()
+    n_samples, n_features = X.shape
+    n_values = n_samples * n_features
+
+    # Inplace dropping of values; zero-correlation (MCAR) case.
+    # For even indexed features the drop-probability is 0.3 and
+    # for odd indexed ones, 0.1.
+    # (Also check that the inplace operation works as expected.)
+    missing_proba = np.array([0.3, 0.1] * 5)
+    vd = ValueDropper(missing_proba=missing_proba, copy=False, random_state=0)
+    vd.transform(X_copy, y)
+    missing_mask = np.isnan(X_copy)
+
+    global_missing_rate = missing_proba.mean()  # 0.2
+
+    # Check the global missing rate
+    assert_almost_equal(missing_mask.ravel().sum() / float(n_values),
+                        global_missing_rate)
+
+    # Check the rate for all even indexed features
+    even_feature_missing_mask = missing_mask[:, missing_proba == 0.3]
+    assert_almost_equal(even_feature_missing_mask.ravel().sum() /
+                        float(even_feature_missing_mask.size), 0.3)
+
+    # Check the rate for all odd indexed features
+    odd_feature_missing_mask = missing_mask[:, missing_proba == 0.1]
+    assert_almost_equal(odd_feature_missing_mask.ravel().sum() /
+                        float(odd_feature_missing_mask.size), 0.1)
+
+    # Now drop a larger fraction (0.6) of values in every feature; this time
+    # not inplace (copy=True is the default).
+    # Check with inf as the missing value marker.
+    vd = ValueDropper(missing_proba=0.6, missing_values=np.inf,
+                      random_state=0)
+    X_more_dropped = vd.transform(X_copy2, y)
+    new_missing_mask = np.isinf(X_more_dropped)
+
+    # Check the global drop-probability
+    assert_almost_equal(new_missing_mask.ravel().sum() / float(n_values), 0.6)
+    # Check the drop-probability for an arbitrarily chosen feature (3)
+    assert_almost_equal(new_missing_mask[:, 3].ravel().sum() /
+                        float(n_samples), 0.6)
+
+    # Ensure X is not modified
+    assert_array_almost_equal(X_copy2, X)
+
+    # Ensure that all the missing positions from the previous step also
+    # exist when missing_proba is scaled up
+    # (important for reproducibility)
+    assert_true(np.all(new_missing_mask[missing_mask]))
+
+
+def test_value_dropper_mcar():
+    # Test missing fractions for the MCAR case in a classification problem
+    n_samples, n_features = 1000, 10
+    X, y_int = make_classification(n_samples=n_samples,
+                                   n_features=n_features, random_state=0)
+    le = LabelEncoder().fit(['a', 'z'])
+    y_str = le.inverse_transform(y_int)
+    for y in (y_str, y_int):
+        check_value_dropper_mcar(X, y)
+
+    # Test missing fractions for the MCAR case in a regression problem
+    n_samples, n_features = 1000, 10
+    X, y = make_regression(n_samples=n_samples, n_features=n_features,
+                           random_state=0)
+    check_value_dropper_mcar(X, y)
+
+
+def test_value_dropper_errors():
+    n_samples, n_features = 1000, 10
+    X, y = make_classification(n_samples=n_samples,
+                               n_classes=4,
+                               n_features=n_features,
+                               n_informative=5,
+                               n_redundant=0,
+                               n_repeated=0,
+                               random_state=0)
+
+    # Raise a sensible error when any probability is outside the range [0, 1]
+    missing_probas = (
+        # NMAR cases
+        {0: 2., 1: 0.25, 2: 0.25, 3: 0.25}, {0: 2, }, {0: -2, }, {0: 2.0, },
+        {0: [0, 0, 0, 0, 0.24, 0, 0, 0, 0, -0.01], },
+        # MCAR cases
+        [0, 0, 0, 0.2, 0.3, -0.1, 0, 0, 0, 0.5], 2.5, 1.5,
+        [0, -1, 0, 0, 0, 0, 0, 0, 0, 0], 2, -2)
+    for missing_proba in missing_probas:
+        assert_raise_message(ValueError,
+                             "should be within the range of [0, 1]",
+                             ValueDropper(
+                                 missing_proba=missing_proba).transform, X, y)
+
+    # Each pair below is an invalid missing_proba and a fragment of the
+    # error message expected from transform.
+    wrong_missing_probas_err_pairs = (
+        # 1D vector with fewer or more than n_features elements
+        ([0.01, ] * 9, "does not conform to the number of features, 10"),
+        ([0.01, ] * 11, "does not conform to the number of features, 10"),
+
+        # Dict with labels having fewer or more than n_features elements
+        ({1: [0.01, ] * 9, },
+         "For label, 1, the shape of the per feature drop-probabilities "
+         "vector does not conform to the number of features, 10"),
+
+        ({0: [0.01, ] * 11, 1: [0.01, ] * 10},
+         "For label, 0, the shape of the per feature drop-probabilities "
+         "vector does not conform to the number of features, 10"),
+
+        # Dict having labels not present in the labels of y
+        ({0: 0.025, 1: [0.0025, ] * 10, 2: 0.025, 3: 0.025, 4: 0.025},
+         "y contains new labels: \[4\]"),
+
+        # Incorrect dict or incorrect value
+        ({0: 'foo', },
+         "For label, 0, probability value must be a float or 1D vector "
+         "\(list, tuple or np.ndarray\) of shape \(n_features,\) "
+         "'foo' was passed"),
+
+        ("foobar",
+         "must be a float or 1D vector \(list, tuple or np.ndarray\)"
+         " of shape \(n_features,\) or dict of floats/1D vectors. "
+         "'foobar' was passed."))
+
+    for missing_proba, err_msg in wrong_missing_probas_err_pairs:
+        assert_raises_regexp(ValueError, err_msg,
+                             ValueDropper(missing_proba=missing_proba)
+                             .transform, X, y)
+
+    # When missing_proba is a dict, but y is not given
+    missing_proba = {0: 0.025}
+    assert_raise_message(
+        ValueError, "The missing_proba is a dict but y is None.",
+        ValueDropper(missing_proba=missing_proba).transform, X)
diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py
new file mode 100644
index 0000000000000..29ca635d131ad
--- /dev/null
+++ b/sklearn/datasets/value_dropper.py
@@ -0,0 +1,295 @@
+# Author : Raghav RV
+#
+# License : BSD 3 clause
+
+import numbers
+
+import numpy as np
+
+from sklearn.utils import check_array
+from sklearn.utils import check_random_state
+from sklearn.utils.multiclass import type_of_target
+
+from sklearn.base import TransformerMixin
+from sklearn.preprocessing import LabelEncoder
+
+
+__all__ = ["ValueDropper"]
+
+
+class ValueDropper(TransformerMixin):
+    """Artificially insert NMAR or MCAR missing values into data.
+
+    Where,
+
+    NMAR/MNAR - Not Missing At Random / Missing Not At Random
+        When the missingness is correlated with the class labels in the
+        target (y) (and hence informative).
+
+    MCAR - Missing Completely At Random
+        When the missingness is completely random (and hence uninformative).
+
+    For NMAR missingness, ``missing_proba`` can be given as a dict to drop
+    values conforming to the given per-class drop-probabilities.
+
+    Parameters
+    ----------
+
+    missing_values : {"NaN" (or np.nan) | int | float}, default "NaN"
+        The value to insert to indicate missingness.
+
+    missing_proba : float, 1D array-like of floats, or dict
+        If a single float is given, it specifies the global drop-probability
+        that is applied to every feature (MCAR missingness).
+
+        To vary the proportion of values dropped across each feature,
+        individual drop-probabilities for each feature can be specified as a
+        1D array-like of shape (n_features,) (e.g. [0.1, 0.15, 0.1]).
+
+        If missingness is NMAR, a dict of floats can be used to specify
+        the drop-probabilities on a per-label basis
+        (e.g. {1: 0.2, 2: 0.3, 3: 0.5}).
+
+        This dict can also contain 1D array-likes of shape (n_features,)
+        to vary the drop-probabilities across features
+        (e.g. {1: 0.1, 3: [0.1, 0.15, 0.1]}).
+
+    copy : bool, default True
+        Whether to copy the data or work inplace.
+
+    random_state : int, optional
+        The seed for numpy's random number generator.
+
+        If ``random_state`` is set to an integer, the ``missing_proba``
+        can be upscaled safely with the assumption that all the values
+        dropped with the smaller scale will also be dropped with the larger
+        scaled version::
+
+            missing_proba_1 = {0: 0.1, 3: [0.3, 0.1, 0.1]}
+            missing_proba_2 = {0: 0.1, 1: 0.2, 3: [0.6, 0.1, 0.8]}
+
+        The missing values dropped with ``missing_proba_1`` will also
+        be dropped with ``missing_proba_2``.
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> X = np.array([[0., 1., 2.],
+    ...               [3., 4., 5.],
+    ...               [6., 7., 8.],
+    ...               [9., 0., 1.],
+    ...               [2., 3., 4.],
+    ...               [8., 9., 8.],
+    ...               [1., 0., 5.],
+    ...               [7., 8., 9.],
+    ...               [5., 4., 3.],
+    ...               [2., 1., 1.],
+    ...               [1., 2., 3.]])
+    >>> y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+    >>> # NMAR missingness -
+    >>> # Drop values from samples of class 1 alone, based on the below
+    >>> # missing_proba, hence making it Not Missing At Random missingness.
+    >>> missing_proba = {1: [0.2,  # Drop 20% of the values of feature 0
+    ...                      0.2,  # and 20% of feature 1, for class 1
+    ...                      0]}   # Do not drop any values from feature 2
+    >>> vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+    >>> vd.transform(X, y)
+    array([[  0.,   1.,   2.],
+           [  3.,   4.,   5.],
+           [  6.,   7.,   8.],
+           [  9.,   0.,   1.],
+           [  2.,   3.,   4.],
+           [  8.,  nan,   8.],
+           [  1.,   0.,   5.],
+           [ nan,   8.,   9.],
+           [  5.,   4.,   3.],
+           [  2.,   1.,   1.],
+           [  1.,   2.,   3.]])
+    >>> # Increase the missing_proba to add more missing values in feature 0.
+    >>> # Also add a few missing values in all features for class 0 samples.
+    >>> missing_proba = {1: [0.4, 0.2, 0], 0: 0.6}
+    >>> vd = ValueDropper(missing_proba=missing_proba, random_state=0)
+    >>> vd.transform(X, y)
+    array([[ nan,  nan,  nan],
+           [ nan,   4.,  nan],
+           [ nan,  nan,   8.],
+           [  9.,   0.,   1.],
+           [  2.,  nan,  nan],
+           [  8.,  nan,   8.],
+           [ nan,   0.,   5.],
+           [ nan,   8.,   9.],
+           [  5.,   4.,   3.],
+           [  2.,   1.,   1.],
+           [  1.,   2.,   3.]])
+    >>> # MCAR missingness -
+    >>> # Drop 30% of values in each feature Missing Completely At Random
+    >>> vd = ValueDropper(missing_proba=0.3, random_state=0)
+    >>> vd.transform(X, y)
+    array([[  0.,   1.,  nan],
+           [  3.,   4.,   5.],
+           [  6.,  nan,   8.],
+           [  9.,   0.,   1.],
+           [  2.,  nan,   4.],
+           [ nan,   9.,   8.],
+           [ nan,  nan,   5.],
+           [ nan,   8.,  nan],
+           [  5.,   4.,  nan],
+           [  2.,   1.,   1.],
+           [  1.,   2.,   3.]])
+    >>> # Increase the missing_proba to add more missing values in features
+    >>> # 0 and 1 alone. Retain the same drop-probability for feature 2.
+    >>> # Explicitly set copy=False for inplace dropping of values.
+    >>> vd = ValueDropper(missing_proba=[0.6, 0.8, 0.3],
+    ...                   copy=False, random_state=0)
+    >>> _ = vd.transform(X, y)
+    >>> X
+    array([[  0.,   1.,  nan],
+           [ nan,  nan,   5.],
+           [  6.,  nan,   8.],
+           [  9.,   0.,   1.],
+           [ nan,  nan,   4.],
+           [ nan,  nan,   8.],
+           [ nan,  nan,   5.],
+           [ nan,  nan,  nan],
+           [  5.,  nan,  nan],
+           [ nan,  nan,   1.],
+           [ nan,  nan,   3.]])
+    """
+
+    def __init__(self, missing_values="NaN",
+                 missing_proba=None, copy=True, random_state=None):
+        self.missing_values = missing_values
+        self.missing_proba = missing_proba
+        self.copy = copy
+        self.random_state = random_state
+
+    def transform(self, X, y=None):
+        """Drop values from ``X`` according to the given distribution.
+
+        Parameters
+        ----------
+
+        X : array-like of shape (n_samples, n_features)
+            Data, in which the values must be dropped and set to
+            ``missing_values``.
+
+        y : array-like of shape (n_samples,), optional for MCAR
+            Target relative to X for classification or regression.
+            When missing_proba is not a dict (MCAR missingness),
+            ``y`` need not be passed.
+        """
+        # Resolve the value that will be used to mark missingness
+        if ((isinstance(self.missing_values, str) and
+                (self.missing_values.lower() == "nan")) or
+                np.isnan(self.missing_values)):
+            missing_values = np.nan
+        else:
+            missing_values = self.missing_values
+
+        # Don't allow pre-existing missing values in X, to simplify the API
+        X = check_array(X, dtype=('numeric'
+                                  if isinstance(missing_values,
+                                                (numbers.Integral,
+                                                 np.integer))
+                                  else np.float64),
+                        copy=self.copy)
+
+        n_samples, n_features = X.shape
+        rng = check_random_state(self.random_state)
+
+        # Validate y, and find the type of missingness
+        if isinstance(self.missing_proba, dict):
+            # For NMAR:
+            # validate and convert the missing_proba dict into a
+            # 2D probability distribution along the features and labels
+            missing_type = 'nmar'
+
+            if y is None:
+                raise ValueError("The missing_proba is a dict but y is "
+                                 "None. If missingness is to be related to "
+                                 "the class labels, target class labels (y) "
+                                 "must be passed.")
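+
+            # For NMAR, y is needed so that the samples can be grouped by
+            # class label before the values are dropped.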
+            target_type = type_of_target(y)
+            if 'continuous' in target_type or 'multioutput' in target_type:
+                raise ValueError("Value dropping based on the given "
+                                 "distribution can be done only for single "
+                                 "target which is discrete (classification "
+                                 "tasks). The given target (y) is of type %s"
+                                 % target_type)
+            y = check_array(y, ensure_2d=False, dtype='numeric')
+
+            le = LabelEncoder().fit(y)
+            classes = le.classes_
+            n_classes = classes.shape[0]
+
+            drop_probs = np.zeros((n_classes, n_features), dtype=np.float64)
+
+            class_keys, probas = zip(*self.missing_proba.items())
+            encoded_class_keys = le.transform(class_keys)
+        else:
+            # For MCAR:
+            # validate and convert the missing_proba into a
+            # 1D probability distribution along the features
+            missing_type = 'mcar'
+
+            drop_probs = np.zeros((1, n_features), dtype=np.float64)
+
+            # Hack to simplify and unify the missing value generation code
+            # for the nmar and mcar cases
+            classes = class_keys = encoded_class_keys = (0, )
+            probas = (self.missing_proba, )
+            y = np.zeros(n_samples)
+
+        # For both nmar and mcar
+        for encoded_class_key, class_key, proba in zip(encoded_class_keys,
+                                                       class_keys, probas):
+            if isinstance(proba, (np.ndarray, list, tuple)):
+                proba = np.asarray(proba)
+                if proba.shape[0] != n_features:
+                    raise ValueError("%s shape of the per feature "
+                                     "drop-probabilities vector "
+                                     "does not conform to the number of "
+                                     "features, %d"
+                                     % ("For label, %s, the" % class_key
+                                        if missing_type == 'nmar'
+                                        else "The", n_features))
+            elif not isinstance(proba, (np.floating, float,
+                                        numbers.Integral, np.integer)):
+                raise ValueError("%s value must be a float or "
+                                 "1D vector (list, tuple or np.ndarray) of "
+                                 "shape (n_features,)%s %r was passed."
+                                 % ("For label, %s, probability" % class_key
+                                    if missing_type == 'nmar'
+                                    else 'Probability',
+                                    " or dict of floats/1D vectors."
+                                    if missing_type == 'mcar' else "", proba))
+
+            drop_probs[encoded_class_key, :] = proba
+
+        if np.any(drop_probs < 0) or np.any(drop_probs > 1):
+            raise ValueError("All the individual drop-probabilities should "
+                             "be within the range of [0, 1]. The given "
+                             "missing_proba does not conform to that. %r"
+                             % self.missing_proba)
+
+        # Generate a random_state for each feature / label combination in
+        # advance. This is important to keep the generated missing values
+        # consistent for successively increasing values of missing_proba.
+        random_states = rng.randint(0, np.iinfo(np.int32).max,
+                                    drop_probs.shape)
+
+        for i, class_i in enumerate(classes):
+            samples_mask = (y == class_i)
+            this_n_samples = samples_mask.sum()
+            this_block_indices = np.arange(n_samples)[samples_mask]
+
+            for feature in range(n_features):
+                this_required_n_missing = int(round(drop_probs[i, feature] *
+                                                    this_n_samples))
+                if this_required_n_missing == 0:
+                    continue
+
+                this_rng = check_random_state(random_states[i, feature])
+                shuffled_indices = this_rng.permutation(this_block_indices)
+
+                # Drop the first this_required_n_missing shuffled positions
+                X[shuffled_indices[:this_required_n_missing],
+                  feature] = missing_values
+
+        return X