From 9ec9512b2af7dc102916dceea3ff6f65a73e08be Mon Sep 17 00:00:00 2001 From: Raghav R V Date: Wed, 27 Jul 2016 17:06:50 +0200 Subject: [PATCH 01/16] ENH Add ValueDropper to artificially insert missing values of MCAR or NMAR Squashed Commits ================ Use sep='\t' Use permutation to make tests pass on older numpy; label --> class label --> class drop_fraction --> missing_rate ENH Simplify the API by accepting a multinomial distribution for missingness FIX default missing_values is 'NaN' Don't set instance vars in transform Simplify API even more; Use only missing_distribution Modify example TST Modify tests for new API Part 1 Modify tests and make them pass copy must be True by default for transformers? Bugfixes to pass travis Simplify API - Preserve missing locations when scaled up and disallow existing missing Ensure the old drop locations are preserved. FIX doctest np's random choice is not available for np < 1.7 Add tests for ValueError-s DOC move randomization doc to random_state. Move NMAR/MCAR to class desc. FIX precision error in numpy 2.6 use np.finfo(float).eps to avoid precision errors; regexp err matching Modify and clean up example simplify MCAR/NMAR Move helper out of transform Bind the helper to the class itself DOC Add whatsnew Remove missing mask completely now that we don't allow previous nan TST Fix tests Use explicit 'Ellipsis' to select all samples TST for string labels in y too --- doc/modules/classes.rst | 14 +- doc/whats_new.rst | 4 + examples/datasets/generate_missing_values.py | 143 +++++++ sklearn/datasets/__init__.py | 4 +- sklearn/datasets/tests/test_value_dropper.py | 262 ++++++++++++ sklearn/datasets/value_dropper.py | 394 +++++++++++++++++++ 6 files changed, 819 insertions(+), 2 deletions(-) create mode 100644 examples/datasets/generate_missing_values.py create mode 100644 sklearn/datasets/tests/test_value_dropper.py create mode 100644 sklearn/datasets/value_dropper.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index e7585823cd2dc..4d261bbdd015d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -303,6 +303,18 @@ Samples generator datasets.make_checkerboard +Missing Value Generator +----------------------- + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + datasets.ValueDropper + + .. _decomposition_ref: :mod:`sklearn.decomposition`: Matrix Decomposition @@ -1418,4 +1430,4 @@ To be removed in 0.20 cross_validation.cross_val_score cross_validation.check_cv cross_validation.permutation_test_score - cross_validation.train_test_split \ No newline at end of file + cross_validation.train_test_split diff --git a/doc/whats_new.rst b/doc/whats_new.rst index fef9ec8b72d9f..920983b2dca3a 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -35,6 +35,10 @@ New features detection based on nearest neighbors. :issue:`5279` by `Nicolas Goix`_ and `Alexandre Gramfort`_. + - Introduced the :class:`datasets.ValueDropper` transformer to artificially + introduce missing values conforming to a given distribution (NMAR) or + completely at random (MCAR). :issue:`7084` by `Raghav RV`_. + Enhancements ............ 
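For orientation before the files themselves, the whats_new entry above corresponds to usage along the following lines. This is a minimal sketch, not part of the patch: it assumes the patch is applied, and the ``missing_distribution`` semantics (a global fraction for MCAR, a per-class dict for NMAR) are taken from the docstrings added below.

import numpy as np
from sklearn.datasets import ValueDropper  # available once this patch is applied

X = np.arange(40, dtype=float).reshape(10, 4)
y = np.array([0] * 5 + [1] * 5)

# MCAR: drop ~10% of all values, uniformly across samples and features.
mcar = ValueDropper(missing_distribution=0.1, random_state=0)
X_mcar = mcar.transform(X, y)

# NMAR: the same overall fraction, but every dropped value comes from a
# sample of class 1, spread evenly over the features.
nmar = ValueDropper(missing_distribution={1: 0.1}, random_state=0)
X_nmar = nmar.transform(X, y)

assert not np.isnan(X_nmar[y == 0]).any()  # class 0 rows stay complete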
diff --git a/examples/datasets/generate_missing_values.py b/examples/datasets/generate_missing_values.py
new file mode 100644
index 0000000000000..26e104bb22eec
--- /dev/null
+++ b/examples/datasets/generate_missing_values.py
@@ -0,0 +1,143 @@
+"""
+=============================================
+Generating NMAR / MCAR missing_values in data
+=============================================
+
+This example illustrates how the :class:`sklearn.datasets.ValueDropper` can
+be used to generate missing values completely at random or conforming to the
+given distribution.
+
+The :class:`sklearn.datasets.ValueDropper` is a transformer which can be
+initialized with a ``missing_distribution`` specifying the drop probabilities
+for each label (and each feature if needed). It preserves the missing values
+of a lower scaled ``missing_distribution`` in a higher scaled
+``missing_distribution``. This facilitates benchmarking missing-value
+strategies and evaluating the performance of such strategies with
+respect to the type and extent of missingness in data.
+
+It allows benchmarking with incremental missing rates (fraction of missing
+values to total number of values) without introducing a mismatch in the
+missing positions for previous lower rates of missing values.
+
+NMAR or Not Missing At Random refers to the case when the missingness in the
+data is not distributed at random. It is either correlated with the target
+value(s) or with the data itself.
+
+MCAR or Missing Completely At Random refers to the case when the missingness
+in the data is completely random and does not correlate with the
+classification target value(s) or the data.
+
+NMAR is sometimes also referred to as MNAR (Missing Not At Random).
+"""
+# Author: Raghav RV
+#
+# License: BSD 3 clause
+
+from __future__ import print_function
+
+import numpy as np
+from sklearn.datasets import ValueDropper
+
+print(__doc__)
+
+
+X = np.array([[0, 1, 2],
+              [3, 4, 5],
+              [6, 7, 8],
+              [9, 0, 1],
+              [2, 3, 4],
+              [8, 9, 8],
+              [8, 9, 8],
+              [1, 0, 5],
+              [5, 4, 3],
+              [2, 1, 1],
+              [3, 4, 5],
+              [2, 3, 4],
+              [8, 9, 8],
+              [7, 8, 9]], dtype=float)
+y = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2])
+
+# Drop 10% of values across all features, where all missing values
+# come from samples of class 1
+
+vd = ValueDropper(missing_distribution={1: 0.1}, random_state=42)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping 10% of values when class label(s) are 1")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Drop another 10% of values across all features, where all missing values
+# come from samples of class 1
+
+vd = ValueDropper(missing_distribution={1: 0.2}, random_state=42)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping another 10% of values when class label(s) are 1")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Drop 30% of values completely at random
+
+vd = ValueDropper(missing_distribution=0.3, random_state=42)
+X_dropped = vd.transform(X, y)
+
+print("\nAfter dropping 30% of values randomly")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# Drop 30% of values but according to the given probability distribution
+# Incrementally adding 10% each time
+
+# 40% of the dropped values must be from class label 0
+# (evenly across all features)
+# The rest 60% of the dropped values are from class label 1, distributed in the
+# 1:2:0 ratio amongst the features.
+# Don't drop any values from samples of class 2
+abs_missing_rate = 0.1
+missing_distribution = {0: 0.4 * abs_missing_rate,
+                        1: np.array([0.2, 0.4, 0]) * abs_missing_rate}
+
+# Also let's use -1 to denote missing values, this time
+vd = ValueDropper(missing_values=-1, missing_distribution=missing_distribution,
+                  random_state=42)
+X_dropped = vd.transform(X, y)
+
+print("The given class wise distribution is %s " % missing_distribution)
+print("\nAfter dropping 10% of values according to the distribution")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+# NOTE that the relative values of the distribution must not be changed.
+abs_missing_rate = 0.2
+missing_distribution = {0: 0.4 * abs_missing_rate,
+                        1: np.array([0.2, 0.4, 0]) * abs_missing_rate}
+vd = ValueDropper(missing_values=-1, missing_distribution=missing_distribution,
+                  random_state=42)
+X_dropped = vd.transform(X, y)
+print("\nAfter dropping another 10% of values according to the distribution")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
+
+abs_missing_rate = 0.3
+missing_distribution = {0: 0.4 * abs_missing_rate,
+                        1: np.array([0.2, 0.4, 0]) * abs_missing_rate}
+vd = ValueDropper(missing_values=-1, missing_distribution=missing_distribution,
+                  random_state=42)
+X_dropped = vd.transform(X, y)
+print("\nAfter dropping another 10% of values according to the distribution")
+print("y", "X", sep="\t")
+print("------------------------")
+for i in range(y.shape[0]):
+    print(y[i], X_dropped[i], sep="\t")
diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py
index 0a8cfc62df537..e4a4f677b6172 100644
--- a/sklearn/datasets/__init__.py
+++ b/sklearn/datasets/__init__.py
@@ -52,6 +52,7 @@
 from .species_distributions import fetch_species_distributions
 from .california_housing import fetch_california_housing
 from .rcv1 import fetch_rcv1
+from .value_dropper import ValueDropper
 
 
 __all__ = ['clear_data_home',
@@ -102,4 +103,5 @@
            'make_sparse_uncorrelated',
            'make_spd_matrix',
            'make_swiss_roll',
-           'mldata_filename']
+           'mldata_filename',
+           'ValueDropper']
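As a reading aid between the example and its tests, here is how a dict-valued ``missing_distribution`` decomposes into per-feature drop counts under the semantics implemented in ``value_dropper.py`` below. This is a sketch, not part of the patch; the ``int(prob * n_values)`` truncation mirrors the ``drop_counts`` computation in ``transform``.

import numpy as np

n_samples, n_features = 14, 3
n_values = n_samples * n_features  # 42 values in the example's X

# 10% of all values missing: 4% drawn from class 0 (even across features),
# 6% from class 1 split in the ratio 1:2:0 across the three features.
abs_missing_rate = 0.1
missing_distribution = {0: 0.4 * abs_missing_rate,
                        1: np.array([0.2, 0.4, 0]) * abs_missing_rate}

# A scalar is spread evenly over the features; a vector is taken as-is.
drop_probs = {0: np.full(n_features, missing_distribution[0] / n_features),
              1: np.asarray(missing_distribution[1])}

drop_counts = {label: (probs * n_values).astype(int)
               for label, probs in drop_probs.items()}
print(drop_counts)  # tiny datasets truncate to very few (here even 0) drops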
diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py
new file mode 100644
index 0000000000000..ad020a3600908
--- /dev/null
+++ b/sklearn/datasets/tests/test_value_dropper.py
@@ -0,0 +1,262 @@
+import numpy as np
+
+from sklearn.datasets import ValueDropper
+from sklearn.datasets import make_classification, make_regression
+from sklearn.utils.testing import assert_equal
+from sklearn.utils.testing import assert_true
+from sklearn.utils.testing import assert_almost_equal
+from sklearn.utils.testing import assert_array_almost_equal
+from sklearn.utils.testing import assert_raise_message
+from sklearn.utils.testing import assert_raises_regexp
+from sklearn.preprocessing import LabelEncoder
+
+
+def test_value_dropper_mnar_clf():
+    # Test drop probabilities when the missing distribution is
+    # given for classification problems
+    n_samples, n_features = 1000, 5
+    n_values = n_samples * n_features
+    X, y = make_classification(n_samples=n_samples,
+                               n_classes=4,
+                               n_features=n_features,
+                               n_informative=5,
+                               n_redundant=0,
+                               n_repeated=0,
+                               random_state=0)
+    le = LabelEncoder().fit(['a', 'z', 'b', 'j'])
+    y_str = le.inverse_transform(y)
+    y_int = y
+
+    for y in (y_int, y_str):
+        classes = np.unique(y)
+
+        # Inplace dropping of values
+
+        # Samples from class 0 will have a drop probability of 0.1
+        vd = ValueDropper(missing_distribution={classes[0]: 0.1},
+                          missing_values=np.nan, random_state=0)
+        X_dropped = vd.transform(X, y)
+
+        # Check the total drop fraction
+        assert_almost_equal(np.isnan(X_dropped).ravel().sum() /
+                            float(n_values), 0.1)
+
+        # All the missing values are from y == 0
+        assert_almost_equal(np.isnan(X_dropped[y == classes[0]]).ravel().sum() /
+                            float(n_values), 0.1)
+
+        # and no missing values from y != 0
+        assert_almost_equal(np.isnan(X_dropped[y != classes[0]]).ravel().sum() /
+                            float(n_values), 0.)
+
+        # Samples from class 0 will have a total drop fraction of 0.02,
+        # spread unevenly across features as given by the
+        # list of probabilities
+        # And samples from class 1 will have a drop fraction of 0.05
+        # spread evenly across all features
+
+        missing_distribution = {classes[0]: [0.01, 0.005, 0.005, 0, 0],
+                                classes[1]: 0.05}
+        vd = ValueDropper(missing_distribution=missing_distribution,
+                          missing_values=np.nan,
+                          random_state=0)
+        X_dropped = vd.transform(X, y)
+
+        missing_mask = np.isnan(X_dropped)
+
+        # Check that there are no missing values when y != {0 or 1}
+        assert_equal(missing_mask[(y == classes[2])].ravel().sum(), 0)
+        assert_equal(missing_mask[(y == classes[3])].ravel().sum(), 0)
+
+        # Check that the drop fraction for class 1 is 0.05
+        # across all features
+        assert_equal(missing_mask[y == classes[1]].ravel().sum() /
+                     float(n_values), 0.05)
+
+        # Check that the drop fraction for class 0 is 0.02, i.e.
+        # sum(missing_distribution[classes[0]]), across all features
+        assert_equal(missing_mask[y == classes[0]].ravel().sum() /
+                     float(n_values), 0.02)
+
+        # Check that the features indexed 3 and 4 have no missing values
+        # for class 0
+        assert_equal(missing_mask[y == classes[0]][:, [3, 4]].ravel().sum(), 0.)
+
+        # Check that the feature indexed 0 has drop fraction 0.01
+        assert_equal(missing_mask[np.where(y == classes[0])[0],
+                                  (0,)].ravel().sum() /
+                     float(n_values), 0.01)
+
+        # Check that the features indexed 1 and 2 both have drop
+        # fraction 0.005
+        assert_equal(missing_mask[np.where(y == classes[0])[0],
+                                  (1,)].ravel().sum() /
+                     float(n_values), 0.005)
+        assert_equal(missing_mask[np.where(y == classes[0])[0],
+                                  (2,)].ravel().sum() /
+                     float(n_values), 0.005)
+
+        # Ensure that scaling the missing_distribution by a factor of 2
+        # preserves the previously dropped locations
+        missing_distribution = {classes[0]: [0.02, 0.01, 0.01, 0, 0],
+                                classes[1]: 0.1}
+        vd = ValueDropper(missing_distribution=missing_distribution,
+                          missing_values=np.nan, random_state=0)
+        X_dropped2 = vd.transform(X, y)
+        assert_true(np.all(np.isnan(X_dropped2[np.isnan(X_dropped)])))
+
+
+def test_value_dropper_mnar_reg_error():
+    X, y = make_regression(n_samples=10, random_state=0)
+
+    assert_raise_message(ValueError,
+                         "only for single target which is discrete"
+                         " (classification tasks). The given target (y) is of "
+                         "type continuous",
+                         ValueDropper(missing_distribution={0: 0.2}).transform,
+                         X, y)
+
+
+def check_value_dropper_mcar(X, y):
+    X_copy = X.copy()
+    X_copy2 = X.copy()
+    n_samples, n_features = X.shape
+    n_values = n_samples * n_features
+
+    # Inplace dropping of values; 0 correlation case.
+    # For even indexed features the missing fraction is 0.03 and
+    # for odd indexed ones 0.01
+    missing_distribution = np.array([0.03, 0.01] * 5)
+    vd = ValueDropper(missing_distribution=missing_distribution,
+                      copy=False, random_state=0)
+    vd.transform(X_copy, y)
+    missing_mask = np.isnan(X_copy)
+
+    global_missing_rate = missing_distribution.sum()
+
+    # Check the global missing rate
+    assert_almost_equal(missing_mask.ravel().sum() / float(n_values),
+                        global_missing_rate)
+
+    # Check the rate for all even indexed features
+    assert_almost_equal(missing_mask[:, missing_distribution == 0.03]
+                        .ravel().sum() / float(n_values),
+                        0.03 * 5)
+
+    # Check the rate for one even indexed feature
+    assert_almost_equal(missing_mask[:, 0]
+                        .ravel().sum() / float(n_values), 0.03)
+
+    # Check the rate for all odd indexed features
+    assert_almost_equal(missing_mask[:, missing_distribution == 0.01]
+                        .ravel().sum() / float(n_values),
+                        0.01 * 5)
+
+    # Check the rate for one odd indexed feature
+    assert_almost_equal(missing_mask[:, 1]
+                        .ravel().sum() / float(n_values), 0.01)
+
+    # Let us drop a fraction of 0.4 more values. This time not inplace;
+    # copy=True is the default
+    vd = ValueDropper(missing_distribution=0.6, random_state=0)
+    X_more_dropped = vd.transform(X_copy2, y)
+    new_missing_mask = np.isnan(X_more_dropped)
+
+    # Ensure X is not modified
+    assert_array_almost_equal(X_copy2, X)
+
+    # Ensure all the missing positions that were in the previous step also
+    # exist when missing_distribution is scaled up
+    # (Important for reproducibility)
+    assert_true(np.all(new_missing_mask[missing_mask]))
+
+
+def test_value_dropper_mcar():
+    # Test missing fractions for the MCAR case in a classification problem
+    n_samples, n_features = 1000, 10
+    X, y_int = make_classification(n_samples=n_samples,
+                                   n_features=n_features, random_state=0)
+    le = LabelEncoder().fit(['a', 'z'])
+    y_str = le.inverse_transform(y_int)
+    for y in (y_str, y_int):
+        check_value_dropper_mcar(X, y)
+
+    # Test missing fractions for the MCAR case in a regression problem
+    n_samples, n_features = 1000, 10
+    X, y = make_regression(n_samples=n_samples, n_features=n_features,
+                           random_state=0)
+    check_value_dropper_mcar(X, y)
+
+
+def test_value_dropper_errors():
+    n_samples, n_features = 1000, 10
+    X, y = make_classification(n_samples=n_samples,
+                               n_classes=4,
+                               n_features=n_features,
+                               n_informative=5,
+                               n_redundant=0,
+                               n_repeated=0,
+                               random_state=0)
+
+    # Raise a sensible error when the sum of all probabilities in
+    # missing_distribution exceeds or equals 1
+    missing_distributions = (
+        # NMAR cases
+        {0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25}, {0: 2., },
+        {0: [0, 0, 0, 0, 0.24, 0, 0, 0, 0, 0.01], 1: 0.25, 2: [0.025, ] * 10,
+         3: 0.25}, {0: [0.1] * 10, }, {0: 0.26, 1: [0.09, ] * 10},
+        # MCAR cases
+        [0, 0, 0, 0.2, 0.3, 0.1, 0, 0, 0, 0.5], 2.5, 1.5,
+        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0])
+    for missing_distribution in missing_distributions:
+        assert_raise_message(ValueError, "should sum up to less than 1",
+                             ValueDropper(
+                                 missing_distribution=missing_distribution)
+                             .transform, X, y)
+
+    # Accept only floats; valid probabilities lie in [0, 1), so the int
+    # values 0, 1, 2, ... would all be incorrect even as floats and there
+    # is no point in accepting ints
+    assert_raise_message(ValueError,
+                         "missing_distribution must be a float or 1D vector",
+                         ValueDropper(missing_distribution=2).transform, X, y)
+    assert_raise_message(ValueError,
+                         "should either be a single float or an array",
+                         ValueDropper(missing_distribution={0: 2})
+                         .transform, X, y)
+
+    wrong_missing_distributions_err_pairs = (
+        # 1D vector with fewer or
more than n_feature elements + ([0.01, ] * 9, "does not conform to the number of features, 10"), + ([0.01, ] * 11, "does not conform to the number of features, 10"), + + # Dict with labels having fewer or more than n_feature elements + ({1: [0.01, ] * 9, }, + "for label, 1, does not conform to the number of features, 10"), + + ({0: [0.01, ] * 10, 1: [0.01, ] * 11}, + "for label, 1, does not conform to the number of features, 10"), + + # Dict having labels not present in y labels + ({0: 0.025, 1: [0.0025, ] * 10, 2: 0.025, 3: 0.025, 4: 0.025}, + "y contains new labels: \[4\]"), + + # Incorrect dict or incorrect value + ({0: {1: 0.2}, }, + "either be a single float or an array of shape \(n_features,\). " + "\{1: 0.2.*\} was passed for class label 0"), + + ("foobar", + "must be a float or 1D vector \(list, tuple or np.ndarray\)" + " of shape \(n_features,\) or dict")) + + missing_distribution = {0: 0.025, 1: 0.025, 2: 0.025, 3: 0.025} + for missing_distribution, err_msg in wrong_missing_distributions_err_pairs: + assert_raises_regexp(ValueError, err_msg, + ValueDropper( + missing_distribution=missing_distribution) + .transform, X, y) + + # When missing_distribution is a dict, but y is not given + assert_raise_message(ValueError, "", + ValueDropper( + missing_distribution=missing_distribution) + .transform, X) diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py new file mode 100644 index 0000000000000..e04e499a6c2d7 --- /dev/null +++ b/sklearn/datasets/value_dropper.py @@ -0,0 +1,394 @@ +# Author : Raghav RV +# +# Licence : BSD 3 clause + +import numpy as np +import numbers + +from sklearn.utils import check_array +from sklearn.utils import check_random_state +from sklearn.utils.multiclass import type_of_target + +from sklearn.base import TransformerMixin +from sklearn.preprocessing import LabelEncoder + + +__all__ = ["ValueDropper"] + + +class ValueDropper(TransformerMixin): + """Artificially insert NMAR or MCAR missing values into data. + + Where, + + NMAR/MNAR - Not Missing At Random / Missing Not At Random + When the missingness is correlated with the class classes in the + target (y) (and hence informative). + + MCAR - Missing Completely At Random + When the missingness is completely random (and hence uninformative). + + If the missing type is NMAR, a ``missing_distribution`` + parameter can be passed to drop values conforming to that distribution. + + + Parameters + ---------- + + missing_values : {"NaN" (or np.nan) | int}, default "NaN" + The value to insert to indicate missingness. + + + missing_distribution : dict of floats or dict of vector of floats + If ``missing_distribution`` is a float within range [0, 1), + it represents the absolute fraction of values that will be missing:: + 0.2 + + There will be ``0.2 * n_samples * n_features`` numbers of missing + values in the data. + + Alternatively this refers to the probability of a value being dropped + after transform. The values are dropped (approximately) uniformly + across all labels and features. This type of missingness is referred + to as MCAR. + + To vary the distribution across features or to prevent a feature from + having missing values, individual probabilities for each feature can + be specified as a 1D vector of shape ``(n_features,)``:: + [0.2, 0.2, 0] + + For the above example, the probability that a sample will have missing + value in feature 0 is 0.2. 
In other words, after calling + ``transform``, there are ``0.2 * n_samples`` randomly selected samples + with value missing in feature 0 and 1 but no samples with + missing values in feature 2. + + If missingness is not MCAR, ``missing_distribution`` can be used + to specify the multinomial distribution of the newly dropped values + across labels (and if needed across features) as given below. + + If ``missing_distribution`` is a dict of floats:: + {1: 0.02, 2: 0.03, 3: 0.05} + + The probability that a sample from class 1 will have a missing value + is 0.02. + + The missing values are evenly spread across all the features. + + In other words, there are ``int(0.02 / n_features * n_samples)`` + randomly chosen samples of class 1 having missing values in each + feature. + + If there are fewer than ``int(0.02 / n_features * n_samples)`` numbers + of samples in class 1, an error is raised. + + Hence the total missing rate is ``0.02 + 0.03 + 0.05 = 0.1``. + + If ``missing_distribution`` is a dict of vectors (and scalars):: + + {0: 0.1, + 3: [0.1, 0.15, 0.15]} + + Note that the shape of the vector must be ``(n_features,)`` + + There are ``0.1 / n_features * n_samples`` randomly chosen samples, + for each feature, of class 1 having a missing value. + + There are 0 samples of class 1 and 2 having missing value in any + feature. + + And There are ``0.1 * n_samples`` randomly chosen samples having + missing value in feature 0, ``0.15 * n_samples`` randomly chosen + samples having missing value in feature 1 and feature 2, each. + + A few notes: + + Note that the samples are randomly chosen "*for each feature*". + + The global ``missing_rate`` (fraction of missing values in the entire + dataset) is calculated as the sum of all the individual probabilities. + + At the end of transform, the dataset will contain a total of + ``(X.shape[0] * X.shape[1]) * missing_rate`` numbers of missing values. + + copy : bool, default False + Whether to copy the data or work inplace. + + random_state : int, optional + The seed for the numpy's random number generator. + + If ``random_state`` is set to an integer, the ``missing_distribution`` + can be scaled uniformly with the assumption that all the values + dropped with a smaller scale will exist in the larger scaled version:: + missing_distribution_25pc = {0: 0.05, 3: [0.05, 0.075, 0.075]} + missing_distribution_50pc = {0: 0.1, 3: [0.1, 0.15, 0.15]} + + The missing values dropped with ``missing_distribution_25pc`` will also + exist in ``missing_distribution_50pc``. + + This guarantee does not apply when relative probabilities + within the ``missing_distribution`` change between two settings:: + missing_distribution_25pc = {0: 0.05, 3: [0.05, 0.075, 0.075]} + # The below dist. is not a scaled version of above + missing_distribution_50pc = {0: 0.09, 3: [0.11, 0.15, 0.15]} + + + Examples + -------- + + >>> import numpy as np + >>> X = np.array([[0., 1., 2.], + ... [3., 4., 5.], + ... [6., 7., 8.], + ... [9., 0., 1.], + ... [2., 3., 4.], + ... [8., 9., 8.], + ... [1., 0., 5.], + ... [7., 8., 9.], + ... [5., 4., 3.], + ... [2., 1., 1.], + ... [1., 2., 3.]]) + >>> y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + >>> abs_missing_rate = 0.1 + >>> # Drop values across features 0 and 1 in + >>> # the ratio of 4:1 from samples with class label as 1 + >>> # The final fraction values that will be missing + >>> missing_distribution = {1: [0.8 * abs_missing_rate, + ... 0.2 * abs_missing_rate, + ... 0]} + >>> vd = ValueDropper(missing_distribution=missing_distribution, + ... 
random_state=0)
+    >>> vd.transform(X, y)
+    array([[  0.,   1.,   2.],
+           [  3.,   4.,   5.],
+           [  6.,   7.,   8.],
+           [  9.,   0.,   1.],
+           [  2.,   3.,   4.],
+           [  8.,   9.,   8.],
+           [  1.,   0.,   5.],
+           [ nan,   8.,   9.],
+           [  5.,   4.,   3.],
+           [  2.,   1.,   1.],
+           [ nan,   2.,   3.]])
+    >>> # Upscale the missing_distribution to add more missing values
+    >>> abs_missing_rate = 0.2
+    >>> missing_distribution = {1: [0.8 * abs_missing_rate,
+    ...                             0.2 * abs_missing_rate,
+    ...                             0]}
+    >>> vd = ValueDropper(missing_distribution=missing_distribution,
+    ...                   random_state=0)
+    >>> vd.transform(X, y)
+    array([[  0.,   1.,   2.],
+           [  3.,   4.,   5.],
+           [  6.,   7.,   8.],
+           [  9.,   0.,   1.],
+           [  2.,   3.,   4.],
+           [ nan,   9.,   8.],
+           [ nan,  nan,   5.],
+           [ nan,   8.,   9.],
+           [ nan,   4.,   3.],
+           [  2.,   1.,   1.],
+           [ nan,   2.,   3.]])
+    >>> # MCAR missingness
+    >>> vd = ValueDropper(missing_distribution=abs_missing_rate,
+    ...                   random_state=0)
+    >>> vd.transform(X, y)
+    array([[  0.,   1.,   2.],
+           [  3.,   4.,   5.],
+           [  6.,   7.,  nan],
+           [  9.,   0.,   1.],
+           [ nan,   3.,   4.],
+           [  8.,   9.,   8.],
+           [  1.,   0.,  nan],
+           [  7.,   8.,   9.],
+           [  5.,   4.,   3.],
+           [ nan,  nan,   1.],
+           [  1.,  nan,   3.]])
+    >>> # Upscale the missing_distribution to add more missing values
+    >>> # Explicitly set copy=False for inplace dropping of values
+    >>> vd = ValueDropper(missing_distribution=2 * abs_missing_rate,
+    ...                   copy=False, random_state=0)
+    >>> _ = vd.transform(X, y)
+    >>> X
+    array([[  0.,   1.,   2.],
+           [  3.,   4.,   5.],
+           [ nan,   7.,  nan],
+           [  9.,  nan,   1.],
+           [ nan,   3.,   4.],
+           [  8.,  nan,   8.],
+           [  1.,   0.,  nan],
+           [  7.,   8.,  nan],
+           [  5.,   4.,   3.],
+           [ nan,  nan,  nan],
+           [ nan,  nan,   3.]])
+    """
+
+    def __init__(self, missing_values="NaN",
+                 missing_distribution=None, copy=True, random_state=None):
+        self.missing_values = missing_values
+        self.missing_distribution = missing_distribution
+        self.copy = copy
+        self.random_state = random_state
+
+    def transform(self, X, y=None):
+        """Drop values from ``X`` according to the given distribution.
+
+        Parameters
+        ----------
+
+        X : ndarray-like of shape (n_samples, n_features)
+            Data, in which the values must be dropped and set to
+            ``missing_values``.
+
+        y : array-like, shape = (n_samples,), optional for MCAR
+            Target relative to X for classification or regression;
+            When missing_distribution is not a dict (for MCAR missingness),
+            ``y`` need not be passed.
+        """
+        # Validate missing_values and generate missing_mask
+        if ((isinstance(self.missing_values, str) and
+                (self.missing_values.lower() == "nan")) or
+                np.isnan(self.missing_values)):
+            missing_values = np.nan
+        else:
+            missing_values = self.missing_values
+
+        # Don't allow pre-existing missing values in X, to simplify API
+        X = check_array(X, dtype=('numeric'
+                                  if isinstance(missing_values,
+                                                (numbers.Integral, np.integer))
+                                  else np.float),
+                        copy=self.copy)
+
+        n_samples, n_features = X.shape
+        n_values = n_samples * n_features
+
+        rng = check_random_state(self.random_state)
+
+        # Validate y, and find type of missingness
+        if isinstance(self.missing_distribution, dict):
+            missing_type = 'nmar'
+            # For NMAR
+            # Validate and convert the missing_distribution dict into a
+            # 2D probability distribution along the features and labels
+
+            if y is None:
+                raise ValueError("The missing_distribution is a dict "
+                                 "but y is None.
If missingness is to be " + "related to the class labels, target class " + "labels (y) must be passed.") + + target_type = type_of_target(y) + if 'continuous' in target_type or 'multioutput' in target_type: + raise ValueError("Value dropping based on the given " + "distribution can be done only for single " + "target which is discrete (classification " + "tasks). The given target (y) is of type %s" + % target_type) + y = check_array(y, ensure_2d=False, dtype='numeric') + + le = LabelEncoder().fit(y) + classes = le.classes_ + n_classes = classes.shape[0] + + drop_probs = np.zeros((n_classes, n_features), dtype=np.float64) + + for class_key, val in self.missing_distribution.items(): + # This will also validate incorrect values for class_key + encoded_class_key = le.transform([class_key, ])[0] + + if isinstance(val, (np.floating, float)): + drop_probs[encoded_class_key, :] = ( + val / float(n_features)) + elif isinstance(val, (np.ndarray, list, tuple)): + val = np.asarray(val) + if val.shape[0] != n_features: + raise ValueError("The shape of the per feature" + " drop probabilities vector " + "for label, %s, does not conform" + " to the number of features, %d" + % (class_key, n_features)) + drop_probs[encoded_class_key, :] = val + else: + raise ValueError("If missing_distribution is a dict with" + " target labels as keys, the values of " + "the dict should either be a single float" + " or an array of shape (n_features,). " + "%r was passed for class label %s" + % (val, class_key)) + + else: + missing_type = 'mcar' + + # For MCAR + # Validate and convert the missing_distribution dict into a + # 1D probability distribution along the features + + drop_probs = np.zeros((1, n_features), dtype=np.float64) + + if isinstance(self.missing_distribution, (float, np.floating)): + drop_probs[:] = self.missing_distribution / n_features + elif isinstance(self.missing_distribution, + (list, tuple, np.ndarray)): + missing_distribution = np.asarray(self.missing_distribution) + if missing_distribution.shape[0] != n_features: + raise ValueError("The shape of the per feature " + "drop probabilities vector does not " + "conform to the number of features, %d" + % n_features) + drop_probs[:] = self.missing_distribution + else: + raise ValueError("missing_distribution must be a float or " + " 1D vector (list, tuple or np.ndarray) of " + "shape (n_features,) or dict of 1D vector / " + "floats. %r was passed" + % self.missing_distribution) + + if 1 - drop_probs.ravel().sum() <= np.finfo(float).eps: + raise ValueError("The sum of all probabilities in the " + "missing_distribution should sum up to less " + "than 1. 
The sum was found to be %0.8f" + % drop_probs.ravel().sum()) + + drop_counts = (drop_probs * n_values).astype(int) + + if missing_type == 'mcar': + self._block_drop_missing_values(X, Ellipsis, 0, drop_counts, rng, + missing_type, missing_values) + else: + # For classification, NMAR, consider missing distribution + # based on class labels as subsets within data + for i, class_i in enumerate(classes): + self._block_drop_missing_values(X, y == class_i, i, + drop_counts, rng, + missing_type, missing_values) + return X + + def _block_drop_missing_values(self, X, samples_mask, encoded_label, + drop_counts, rng, missing_type, + missing_values): + """Helper to insert missing values in given block (label)""" + n_features = X.shape[1] + this_block_indices = np.arange(X.shape[0])[samples_mask] + for feature in range(n_features): + this_n_values = X[samples_mask].shape[0] + this_required_n_missing = drop_counts[encoded_label, feature] + + if this_required_n_missing <= 0: + continue + + if this_required_n_missing > this_n_values: + raise ValueError("There are no more available values at " + "%sfeature - %s, to drop." + # For NMAR, specify the label too + % ("label - %s, " % encoded_label + if missing_type == 'nmar' + else "", feature)) + + # Shuffle and pick this_required_n_missing indices for dropping + picked_indices = rng.permutation( + this_block_indices)[:this_required_n_missing] + + # Drop them + X[picked_indices, feature] = missing_values + return X From ff6f14cd55a5f51698ea2f01fc12074c2969b541 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 21 Nov 2016 13:53:20 +0100 Subject: [PATCH 02/16] Flake8 --- sklearn/datasets/tests/test_value_dropper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py index ad020a3600908..4f017623a43bd 100644 --- a/sklearn/datasets/tests/test_value_dropper.py +++ b/sklearn/datasets/tests/test_value_dropper.py @@ -42,12 +42,12 @@ def test_value_dropper_mnar_clf(): float(n_values), 0.1) # All the missing values are from y == 0 - assert_almost_equal(np.isnan(X_dropped[y == classes[0]]).ravel().sum() / - float(n_values), 0.1) + assert_almost_equal(np.isnan( + X_dropped[y == classes[0]]).ravel().sum() / float(n_values), 0.1) # and no missing values from y != 0 - assert_almost_equal(np.isnan(X_dropped[y != classes[0]]).ravel().sum() / - float(n_values), 0.) + assert_almost_equal(np.isnan( + X_dropped[y != classes[0]]).ravel().sum() / float(n_values), 0.) # Samples from class 0 will have a drop probabilty of 0.3 # but spread unevenly across features as given by the From 613e491813b7670bc4865210b87e9413a99f9340 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Tue, 29 Nov 2016 17:31:27 +0100 Subject: [PATCH 03/16] ENH use drop-probabilites instead of missing fractions --- examples/datasets/generate_missing_values.py | 98 +++---- sklearn/datasets/tests/test_value_dropper.py | 179 +++++------- sklearn/datasets/value_dropper.py | 282 ++++++++----------- 3 files changed, 224 insertions(+), 335 deletions(-) diff --git a/examples/datasets/generate_missing_values.py b/examples/datasets/generate_missing_values.py index 26e104bb22eec..819b57d79807a 100644 --- a/examples/datasets/generate_missing_values.py +++ b/examples/datasets/generate_missing_values.py @@ -8,10 +8,10 @@ given distribution. 
 The :class:`sklearn.datasets.ValueDropper` is a transformer which can be
-initialized with a ``missing_distribution`` specifying the drop probabilities
+initialized with a ``missing_proba`` specifying the drop probabilities
 for each label (and each feature if needed). It preserves the missing values
-of a lower scaled ``missing_distribution`` in a higher scaled
-``missing_distribution``. This facilitates benchmarking missing-value
+of a lower scaled ``missing_proba`` in a higher scaled
+``missing_proba``. This facilitates benchmarking missing-value
 strategies and evaluating the performance of such strategies with
 respect to the type and extent of missingness in data.
@@ -42,49 +42,35 @@
 print(__doc__)
 
 
-X = np.array([[0, 1, 2],
-              [3, 4, 5],
-              [6, 7, 8],
-              [9, 0, 1],
-              [2, 3, 4],
-              [8, 9, 8],
-              [8, 9, 8],
-              [1, 0, 5],
-              [5, 4, 3],
-              [2, 1, 1],
-              [3, 4, 5],
-              [2, 3, 4],
-              [8, 9, 8],
-              [7, 8, 9]], dtype=float)
-y = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2])
-
-# Drop 10% of values across all features, where all missing values
-# come from samples of class 1
-
-vd = ValueDropper(missing_distribution={1: 0.1}, random_state=42)
+X = np.random.RandomState(0).random_sample((20, 3))
+y = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2])
+
+# 20% of the values in samples of class 1 will be dropped across all
+# features
+vd = ValueDropper(missing_proba={1: 0.2}, random_state=0)
 X_dropped = vd.transform(X, y)
 
-print("\nAfter dropping 10% of values when class label(s) are 1")
+print("\nAfter dropping 20% of values in samples of class 1 across all"
+      " features")
 print("y", "X", sep="\t")
 print("------------------------")
 for i in range(y.shape[0]):
     print(y[i], X_dropped[i], sep="\t")
 
-# Drop another 10% of values across all features, where all missing values
-# come from samples of class 1
-
-vd = ValueDropper(missing_distribution={1: 0.2}, random_state=42)
+# Drop another 20% of values in samples of class 1 across all features
+vd = ValueDropper(missing_proba={1: 0.4}, random_state=0)
 X_dropped = vd.transform(X, y)
 
-print("\nAfter dropping another 10% of values when class label(s) are 1")
+print("\nAfter dropping another 20% of values in samples of class 1 across"
+      " all features")
 print("y", "X", sep="\t")
 print("------------------------")
 for i in range(y.shape[0]):
     print(y[i], X_dropped[i], sep="\t")
 
-# Drop 30% of values completely at random
+# Drop 30% of values in all features completely at random
 
-vd = ValueDropper(missing_distribution=0.3, random_state=42)
+vd = ValueDropper(missing_proba=0.3, random_state=0)
 X_dropped = vd.transform(X, y)
 
 print("\nAfter dropping 30% of values randomly")
@@ -93,50 +79,40 @@
 for i in range(y.shape[0]):
     print(y[i], X_dropped[i], sep="\t")
 
-# Drop 30% of values but according to the given probability distribution
-# Incrementally adding 10% each time
-
-# 40% of the dropped values must be from class label 0
-# (evenly across all features)
-# The rest 60% of the dropped values are from class label 1, distributed in the
-# 1:2:0 ratio amongst the features.
-# Don't drop any values from samples of class 2 -abs_missing_rate = 0.1 -missing_distribution = {0: 0.4 * abs_missing_rate, - 1: np.array([0.2, 0.4, 0]) * abs_missing_rate} - -# Also let's use -1 to denote missing values, this time -vd = ValueDropper(missing_values=-1, missing_distribution=missing_distribution, - random_state=42) +# Drop values based on the given probability distribution + +# For samples of class 0, drop 10% of values (evenly across all features) +# For samples of class 1, drop 20% of values in feature 0, 40% in feature 1 +# and None in feature 2 +# Don't drop any values for samples of class 2. +missing_proba = {0: 0.1, 1: [0.2, 0.4, 0]} +vd = ValueDropper(missing_proba=missing_proba, random_state=0) X_dropped = vd.transform(X, y) -print("The given class wise distribution is %s " % missing_distribution) -print("\nAfter dropping 10% of values according to the distribution") +print("The given class wise missing_proba dict is %s " % missing_proba) +print("\nAfter dropping one set of missing values based on the " + " missing_proba=%s" % missing_proba) print("y", "X", sep="\t") print("------------------------") for i in range(y.shape[0]): print(y[i], X_dropped[i], sep="\t") -# NOTE that the relative values of the distribution must not be changed. -abs_missing_rate = 0.2 -missing_distribution = {0: 0.4 * abs_missing_rate, - 1: np.array([0.2, 0.4, 0]) * abs_missing_rate} -vd = ValueDropper(missing_values=-1, missing_distribution=missing_distribution, - random_state=42) +# Drop twice as many missing values as in previous step. +missing_proba = {0: 0.2, 1: [0.4, 0.6, 0]} +vd = ValueDropper(missing_proba=missing_proba, random_state=0) X_dropped = vd.transform(X, y) -print("\nAfter dropping another 10% of values according to the distribution") +print("\nAfter dropping another set of missing values based on the new" + " missing_proba=%s" % missing_proba) print("y", "X", sep="\t") print("------------------------") for i in range(y.shape[0]): print(y[i], X_dropped[i], sep="\t") -abs_missing_rate = 0.3 -missing_distribution = {0: 0.3 * abs_missing_rate, - 1: np.array([0.2, 0.4, 0]) * abs_missing_rate} -vd = ValueDropper(missing_values=-1, missing_distribution=missing_distribution, - random_state=42) +missing_proba = {0: 0.3, 1: [0.6, 0.8, 0]} +vd = ValueDropper(missing_proba=missing_proba, random_state=0) X_dropped = vd.transform(X, y) -print("\nAfter dropping another 10% of values according to the distribution") +print("\nAfter dropping another set of missing values based on the new" + " missing_proba=%s" % missing_proba) print("y", "X", sep="\t") print("------------------------") for i in range(y.shape[0]): diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py index 4f017623a43bd..7a4d147a6ba04 100644 --- a/sklearn/datasets/tests/test_value_dropper.py +++ b/sklearn/datasets/tests/test_value_dropper.py @@ -32,74 +32,59 @@ def test_value_dropper_mnar_clf(): # Inplace dropping of values - # Samples from class 0 will have a drop probability of 0.1 - vd = ValueDropper(missing_distribution={classes[0]: 0.1}, + # Samples from class 0 will have a drop-probability of 0.1 + vd = ValueDropper(missing_proba={classes[0]: 0.1}, missing_values=np.nan, random_state=0) X_dropped = vd.transform(X, y) + missing_mask = np.isnan(X_dropped) - # Check the total drop fraction - assert_almost_equal(np.isnan(X_dropped).ravel().sum() / - float(n_values), 0.1) + # Check the drop-probabilty for class 0 + assert_almost_equal(missing_mask[y == classes[0]].sum() / + 
(np.sum(y == classes[0]) * n_features), 0.1, + decimal=2) # All the missing values are from y == 0 - assert_almost_equal(np.isnan( - X_dropped[y == classes[0]]).ravel().sum() / float(n_values), 0.1) + assert_almost_equal( + np.isnan(X_dropped[y == classes[0]]).ravel().sum() / + (np.sum(y == classes[0]) * n_features), 0.1, decimal=2) # and no missing values from y != 0 - assert_almost_equal(np.isnan( - X_dropped[y != classes[0]]).ravel().sum() / float(n_values), 0.) + assert_equal(missing_mask[y != classes[0]].ravel().sum(), 0) - # Samples from class 0 will have a drop probabilty of 0.3 + # Samples from class 1 will have a drop probabilty of 0.5 # but spread unevenly across features as given by the # list of probabilities - # And samples from class 1 will have a drop probability of 0.01 - # across all features - - missing_distribution = {classes[0]: [0.01, 0.005, 0.005, 0, 0], - classes[1]: 0.05} - vd = ValueDropper(missing_distribution=missing_distribution, - missing_values=np.nan, - random_state=0) + # And samples from class 0 will have a drop-probabilities as specified + # by a list of drop-probabilites for each feature + missing_proba = {classes[0]: [0.1, 0.2, 0.2, 0, 0], + classes[1]: 0.5} + vd = ValueDropper(missing_proba=missing_proba, + missing_values=np.nan, random_state=0) X_dropped = vd.transform(X, y) missing_mask = np.isnan(X_dropped) - # Check that there are no missing values when y != {0 or 1} assert_equal(missing_mask[(y == classes[2])].ravel().sum(), 0) assert_equal(missing_mask[(y == classes[3])].ravel().sum(), 0) - # Check that the drop probabilites when class == 1 is 0.1 + # Check that the drop probabilites for samples of class 1 is 0.5 # across all features - assert_equal(missing_mask[y == classes[1]].ravel().sum() / - float(n_values), 0.05) - - # Check that the drop probabilites when class == 0 is 2.1 - # across all features sum(missing_distribution[0]) - assert_equal(missing_mask[y == classes[0]].ravel().sum() / - float(n_values), 0.02) - - # Check that the features indexed 3 and 4 have no missing values - # for class 0 - assert_equal(missing_mask[y == classes[0]][3, 4].ravel().sum(), 0.) 
- - # Check that feature indexed 0 has drop prob 0.2 - assert_equal(missing_mask[np.where(y == classes[0])[0], - (0,)].ravel().sum() / - float(n_values), 0.01) - - # Check that feature indexed 1 and 2 both have drop prob 0.05 - # Check that feature indexed 0 has drop prob 0.2 - assert_equal(missing_mask[np.where(y == classes[0])[0], - (1,)].ravel().sum() / - float(n_values), 0.005) - assert_equal(missing_mask[np.where(y == classes[0])[0], - (2,)].ravel().sum() / - float(n_values), 0.005) - - # Ensure scaling the missing_distribution by a factor of 2 - missing_distribution = {classes[0]: [0.02, 0.01, 0.01, 0, 0], - classes[1]: 0.1} - vd = ValueDropper(missing_distribution=missing_distribution, + assert_array_almost_equal( + missing_mask[y == classes[1]].sum(axis=0) / + np.sum(y == classes[1]), [0.5] * n_features, decimal=2) + + # Check that the drop probabilites when class == 0 are as given by + # the missing_proba dict + assert_array_almost_equal(missing_mask[y == classes[0]].sum(axis=0) / + np.sum(y == classes[0]), + missing_proba[classes[0]], + decimal=2) + + # Ensure scaling up the missing_proba retains previously dropped + # locations as long as random_state is set + # The up scaling need not be linear + missing_proba = {classes[0]: [0.1, 0.5, 0.5, 0.1, 0], classes[1]: 0.8} + vd = ValueDropper(missing_proba=missing_proba, missing_values=np.nan, random_state=0) X_dropped2 = vd.transform(X, y) assert_true(np.all(np.isnan(X_dropped2[np.isnan(X_dropped)]))) @@ -112,7 +97,7 @@ def test_value_dropper_mnar_reg_error(): "only for single target which is discrete" " (classification tasks). The given target (y) is of " "type continuous", - ValueDropper(missing_distribution={0: 0.2}).transform, + ValueDropper(missing_proba={0: 0.2}).transform, X, y) @@ -123,49 +108,46 @@ def check_value_dropper_mcar(X, y): n_values = n_samples * n_features # Inplace dropping of values; 0 correlation case. 
- # For even indexed features missing probability is 0.03 and - # for odd indexed ones 0.01 - missing_distribution = np.array([0.03, 0.01] * 5) - vd = ValueDropper(missing_distribution=missing_distribution, - copy=False, random_state=0) + # For even indexed features missing drop-probability is 0.3 and + # for odd indexed ones 0.1 + # (Also check if inplace operation works as expected) + missing_proba = np.array([0.3, 0.1] * 5) + vd = ValueDropper(missing_proba=missing_proba, copy=False, random_state=0) vd.transform(X_copy, y) missing_mask = np.isnan(X_copy) - global_missing_rate = missing_distribution.sum() + global_missing_rate = missing_proba.mean() # 0.2 # Check the global missing rate assert_almost_equal(missing_mask.ravel().sum() / float(n_values), global_missing_rate) # Check the rate for all even indexed features - assert_almost_equal(missing_mask[:, missing_distribution == 0.03] - .ravel().sum() / float(n_values), - 0.03 * 5) - - # Check the rate for one even indexed feature - assert_almost_equal(missing_mask[:, 0] - .ravel().sum() / float(n_values), 0.03) + even_feature_missing_mask = missing_mask[:, missing_proba == 0.3] + assert_almost_equal(even_feature_missing_mask.ravel().sum() / + even_feature_missing_mask.size, 0.3) # Check the rate for all odd features - assert_almost_equal(missing_mask[:, missing_distribution == 0.03] - .ravel().sum() / float(n_values), - 0.03 * 5) - - # Check the rate for one odd indexed feature - assert_almost_equal(missing_mask[:, 1] - .ravel().sum() / float(n_values), 0.01) + odd_feature_missing_mask = missing_mask[:, missing_proba == 0.1] + assert_almost_equal(odd_feature_missing_mask.ravel().sum() / + odd_feature_missing_mask.size, 0.1) # Let us drop 0.3 more fraction of values. This time not inplace # copy=True must be default - vd = ValueDropper(missing_distribution=0.6, random_state=0) + vd = ValueDropper(missing_proba=0.6, random_state=0) X_more_dropped = vd.transform(X_copy2, y) new_missing_mask = np.isnan(X_more_dropped) + # Check global drop probability + assert_almost_equal(new_missing_mask.ravel().sum() / n_values, 0.6) + # Check the drop-probability for a random feature 3 + assert_almost_equal(new_missing_mask[:, 3].ravel().sum() / n_samples, 0.6) + # Ensure X is not modified assert_array_almost_equal(X_copy2, X) # Ensure all the missing positions that were in the previous step also - # exist when missing_distribution is scaled up + # exist when missing_proba is scaled up # (Important for reproducibility) assert_true(np.all(new_missing_mask[missing_mask])) @@ -197,33 +179,21 @@ def test_value_dropper_errors(): n_repeated=0, random_state=0) - # Raise sensible error when sum of all probabilites in missing_distribution - # exceeds or equals 1 - missing_distributions = ( + # Raise sensible error when any probability is outside the range [0, 1] + missing_probas = ( # NMAR cases - {0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25}, {0: 2., }, - {0: [0, 0, 0, 0, 0.24, 0, 0, 0, 0, 0.01], 1: 0.25, 2: [0.025, ] * 10, - 3: 0.25}, {0: [0.1] * 10, }, {0: 0.26, 1: [0.09, ] * 10}, + {0: 2., 1: 0.25, 2: 0.25, 3: 0.25}, {0: 2, }, {0: -2, }, {0: 2.0, }, + {0: [0, 0, 0, 0, 0.24, 0, 0, 0, 0, -0.01],}, # MCAR cases - [0, 0, 0, 0.2, 0.3, 0.1, 0, 0, 0, 0.5], 2.5, 1.5, - [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]) - for missing_distribution in missing_distributions: - assert_raise_message(ValueError, "should sum up to less than 1", + [0, 0, 0, 0.2, 0.3, -0.1, 0, 0, 0, 0.5], 2.5, 1.5, + [0, -1, 0, 0, 0, 0, 0, 0, 0, 0], 2, -2) + for missing_proba in missing_probas: + 
assert_raise_message(ValueError, + "should be within the range of [0, 1]", ValueDropper( - missing_distribution=missing_distribution) - .transform, X, y) - - # Accept only float values 0, 1, 2 all are incorrect values even as float - # hence no point in accepting any int values - assert_raise_message(ValueError, - "missing_distribution must be a float or 1D vector", - ValueDropper(missing_distribution=2).transform, X, y) - assert_raise_message(ValueError, - "should either be a single float or an array", - ValueDropper(missing_distribution={0: 2}) - .transform, X, y) + missing_proba=missing_proba).transform, X, y) - wrong_missing_distributions_err_pairs = ( + wrong_missing_probas_err_pairs = ( # 1D vector with fewer or more than n_feature elements ([0.01, ] * 9, "does not conform to the number of features, 10"), ([0.01, ] * 11, "does not conform to the number of features, 10"), @@ -245,18 +215,15 @@ def test_value_dropper_errors(): "\{1: 0.2.*\} was passed for class label 0"), ("foobar", - "must be a float or 1D vector \(list, tuple or np.ndarray\)" + "must be a float or 1D vector \(list, tuple or np.ndarray\)" " of shape \(n_features,\) or dict")) - missing_distribution = {0: 0.025, 1: 0.025, 2: 0.025, 3: 0.025} - for missing_distribution, err_msg in wrong_missing_distributions_err_pairs: + for missing_proba, err_msg in wrong_missing_probas_err_pairs: assert_raises_regexp(ValueError, err_msg, - ValueDropper( - missing_distribution=missing_distribution) + ValueDropper(missing_proba=missing_proba) .transform, X, y) - # When missing_distribution is a dict, but y is not given - assert_raise_message(ValueError, "", - ValueDropper( - missing_distribution=missing_distribution) - .transform, X) + # When missing_proba is a dict, but y is not given + missing_proba = {0: 0.025} + assert_raise_message( + ValueError, "", ValueDropper(missing_proba=missing_proba).transform, X) diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py index e04e499a6c2d7..7296ba2366238 100644 --- a/sklearn/datasets/value_dropper.py +++ b/sklearn/datasets/value_dropper.py @@ -28,8 +28,8 @@ class ValueDropper(TransformerMixin): MCAR - Missing Completely At Random When the missingness is completely random (and hence uninformative). - If the missing type is NMAR, a ``missing_distribution`` - parameter can be passed to drop values conforming to that distribution. + If the missing type is NMAR, a ``missing_proba`` parameter can be passed + to drop values conforming to the given drop-probabilities. Parameters @@ -38,99 +38,62 @@ class ValueDropper(TransformerMixin): missing_values : {"NaN" (or np.nan) | int}, default "NaN" The value to insert to indicate missingness. + missing_proba : dict of floats or dict of vector of floats + If ``missing_proba`` is a float within range [0, 1), it represents the + probability with which the values will be dropped. - missing_distribution : dict of floats or dict of vector of floats - If ``missing_distribution`` is a float within range [0, 1), - it represents the absolute fraction of values that will be missing:: - 0.2 + The values are dropped (approximately) uniformly across all labels and + features. This type of missingness is referred to as MCAR. - There will be ``0.2 * n_samples * n_features`` numbers of missing - values in the data. + To vary the proportion of values dropped across each feature, + individual drop-probabilities for each feature can be specified as a 1D + vector of shape ``(n_features,)``. 
- Alternatively this refers to the probability of a value being dropped - after transform. The values are dropped (approximately) uniformly - across all labels and features. This type of missingness is referred - to as MCAR. + If missingness is not MCAR, ``missing_proba`` can be used to specify + the drop-probabilities on a per-label (and if needed further on + per-feature basis.). - To vary the distribution across features or to prevent a feature from - having missing values, individual probabilities for each feature can - be specified as a 1D vector of shape ``(n_features,)``:: - [0.2, 0.2, 0] + If ``missing_proba`` is a dict of floats:: + {1: 0.2, 2: 0.3, 3: 0.5} - For the above example, the probability that a sample will have missing - value in feature 0 is 0.2. In other words, after calling - ``transform``, there are ``0.2 * n_samples`` randomly selected samples - with value missing in feature 0 and 1 but no samples with - missing values in feature 2. + This represents, the drop-probabilities for samples of each + class-label. The missing values are evenly spread across all the + features. - If missingness is not MCAR, ``missing_distribution`` can be used - to specify the multinomial distribution of the newly dropped values - across labels (and if needed across features) as given below. - - If ``missing_distribution`` is a dict of floats:: - {1: 0.02, 2: 0.03, 3: 0.05} - - The probability that a sample from class 1 will have a missing value - is 0.02. - - The missing values are evenly spread across all the features. - - In other words, there are ``int(0.02 / n_features * n_samples)`` - randomly chosen samples of class 1 having missing values in each - feature. - - If there are fewer than ``int(0.02 / n_features * n_samples)`` numbers - of samples in class 1, an error is raised. - - Hence the total missing rate is ``0.02 + 0.03 + 0.05 = 0.1``. - - If ``missing_distribution`` is a dict of vectors (and scalars):: + If ``missing_proba`` is a dict of vectors (and scalars):: {0: 0.1, - 3: [0.1, 0.15, 0.15]} + 3: [0.1, 0.15]} Note that the shape of the vector must be ``(n_features,)`` - There are ``0.1 / n_features * n_samples`` randomly chosen samples, - for each feature, of class 1 having a missing value. - - There are 0 samples of class 1 and 2 having missing value in any - feature. - - And There are ``0.1 * n_samples`` randomly chosen samples having - missing value in feature 0, ``0.15 * n_samples`` randomly chosen - samples having missing value in feature 1 and feature 2, each. - - A few notes: + Samples from class 0 are dropped with probability of 0.1 for each + feature and those from class 3 are dropped with a probability of 0.1 + in feature 0, 0.15 in feature 1 while there are no values dropped from + samples of class 1 and 2. Note that the samples are randomly chosen "*for each feature*". - The global ``missing_rate`` (fraction of missing values in the entire - dataset) is calculated as the sum of all the individual probabilities. - - At the end of transform, the dataset will contain a total of - ``(X.shape[0] * X.shape[1]) * missing_rate`` numbers of missing values. - copy : bool, default False Whether to copy the data or work inplace. random_state : int, optional The seed for the numpy's random number generator. 
- If ``random_state`` is set to an integer, the ``missing_distribution`` + If ``random_state`` is set to an integer, the ``missing_proba`` can be scaled uniformly with the assumption that all the values dropped with a smaller scale will exist in the larger scaled version:: - missing_distribution_25pc = {0: 0.05, 3: [0.05, 0.075, 0.075]} - missing_distribution_50pc = {0: 0.1, 3: [0.1, 0.15, 0.15]} + missing_proba_25pc = {0: 0.05, 3: [0.05, 0.075, 0.075]} + missing_proba_50pc = {0: 0.1, 3: [0.1, 0.15, 0.15]} - The missing values dropped with ``missing_distribution_25pc`` will also - exist in ``missing_distribution_50pc``. + The missing values dropped with ``missing_proba_25pc`` will also + exist in ``missing_proba_50pc``. This guarantee does not apply when relative probabilities - within the ``missing_distribution`` change between two settings:: - missing_distribution_25pc = {0: 0.05, 3: [0.05, 0.075, 0.075]} + within the ``missing_proba`` change between two settings:: + missing_proba_25pc = {0: 0.05, 3: [0.05, 0.075, 0.075]} # The below dist. is not a scaled version of above - missing_distribution_50pc = {0: 0.09, 3: [0.11, 0.15, 0.15]} + missing_proba_50pc = {0: 0.09, 3: [0.11, 0.15, 0.15]} Examples @@ -153,11 +116,10 @@ class ValueDropper(TransformerMixin): >>> # Drop values across features 0 and 1 in >>> # the ratio of 4:1 from samples with class label as 1 >>> # The final fraction values that will be missing - >>> missing_distribution = {1: [0.8 * abs_missing_rate, - ... 0.2 * abs_missing_rate, - ... 0]} - >>> vd = ValueDropper(missing_distribution=missing_distribution, - ... random_state=0) + >>> missing_proba = {1: [0.8 * abs_missing_rate, + ... 0.2 * abs_missing_rate, + ... 0]} + >>> vd = ValueDropper(missing_proba=missing_proba, random_state=0) >>> vd.transform(X, y) array([[ 0., 1., 2.], [ 3., 4., 5.], @@ -170,13 +132,12 @@ class ValueDropper(TransformerMixin): [ 5., 4., 3.], [ 2., 1., 1.], [ nan, 2., 3.]]) - >>> # Upscale the missing_distribution to add more missing values + >>> # Upscale the missing_proba to add more missing values >>> abs_missing_rate = 0.2 - >>> missing_distribution = {1: [0.8 * abs_missing_rate, + >>> missing_proba = {1: [0.8 * abs_missing_rate, ... 0.2 * abs_missing_rate, ... 0]} - >>> vd = ValueDropper(missing_distribution=missing_distribution, - ... random_state=0) + >>> vd = ValueDropper(missing_proba=missing_proba, random_state=0) >>> vd.transform(X, y) array([[ 0., 1., 2.], [ 3., 4., 5.], @@ -190,8 +151,7 @@ class ValueDropper(TransformerMixin): [ 2., 1., 1.], [ nan, 2., 3.]]) >>> # MCAR missingness - >>> vd = ValueDropper(missing_distribution=abs_missing_rate, - ... random_state=0) + >>> vd = ValueDropper(missing_proba=abs_missing_rate, random_state=0) >>> vd.transform(X, y) array([[ 0., 1., 2.], [ 3., 4., 5.], @@ -204,9 +164,9 @@ class ValueDropper(TransformerMixin): [ 5., 4., 3.], [ nan, nan, 1.], [ 1., nan, 3.]]) - >>> # Upscale the missing_distribution to add more missing values + >>> # Upscale the missing_proba to add more missing values >>> # Explicitly set copy=False for inplace dropping of values - >>> vd = ValueDropper(missing_distribution=2 * abs_missing_rate, + >>> vd = ValueDropper(missing_proba=2 * abs_missing_rate, ... 
copy=False, random_state=0) >>> _ = vd.transform(X, y) >>> X @@ -224,9 +184,9 @@ class ValueDropper(TransformerMixin): """ def __init__(self, missing_values="NaN", - missing_distribution=None, copy=True, random_state=None): + missing_proba=None, copy=True, random_state=None): self.missing_values = missing_values - self.missing_distribution = missing_distribution + self.missing_proba = missing_proba self.copy = copy self.random_state = random_state @@ -242,7 +202,7 @@ def transform(self, X, y=None): y : array-like, shape = (n_samples,), optional for MCAR Target relative to X for classification or regression; - When missing_distribution is not a dict (for MCAR missingness), + When missing_proba is not a dict (for MCAR missingness), ``y`` need not be passed. """ # Validate missing_values and generate missing_mask @@ -261,19 +221,17 @@ def transform(self, X, y=None): copy=self.copy) n_samples, n_features = X.shape - n_values = n_samples * n_features - rng = check_random_state(self.random_state) # Validate y, and find type of missingness - if isinstance(self.missing_distribution, dict): + if isinstance(self.missing_proba, dict): missing_type = 'nmar' # For NMAR - # Validate and convert the missing_distribution dict into a + # Validate and convert the missing_proba dict into a # 2D probability distribution along the features and labels if y is None: - raise ValueError("The missing_distribution is a dict " + raise ValueError("The missing_proba is a dict " "but y is None. If missingness is to be " "related to the class labels, target class " "labels (y) must be passed.") @@ -293,102 +251,90 @@ def transform(self, X, y=None): drop_probs = np.zeros((n_classes, n_features), dtype=np.float64) - for class_key, val in self.missing_distribution.items(): + for class_key, val in self.missing_proba.items(): # This will also validate incorrect values for class_key encoded_class_key = le.transform([class_key, ])[0] - if isinstance(val, (np.floating, float)): - drop_probs[encoded_class_key, :] = ( - val / float(n_features)) - elif isinstance(val, (np.ndarray, list, tuple)): + if isinstance(val, (np.ndarray, list, tuple)): val = np.asarray(val) if val.shape[0] != n_features: - raise ValueError("The shape of the per feature" - " drop probabilities vector " - "for label, %s, does not conform" - " to the number of features, %d" + raise ValueError("The shape of the per feature " + "drop-probabilities vector " + "for label, %s, does not conform " + "to the number of features, %d" % (class_key, n_features)) - drop_probs[encoded_class_key, :] = val - else: - raise ValueError("If missing_distribution is a dict with" - " target labels as keys, the values of " - "the dict should either be a single float" - " or an array of shape (n_features,). " - "%r was passed for class label %s" - % (val, class_key)) + elif not isinstance(val, (np.floating, float, + numbers.Integral, np.integer)): + raise ValueError("If missing_proba is a dict with " + "target labels as keys, the values of " + "the dict should either be a single " + "float or an array of shape " + "(n_features,). 
%r was passed for class " + "label %s" % (val, class_key)) + + drop_probs[encoded_class_key, :] = val else: missing_type = 'mcar' - # For MCAR - # Validate and convert the missing_distribution dict into a + # Validate and convert the missing_proba dict into a # 1D probability distribution along the features drop_probs = np.zeros((1, n_features), dtype=np.float64) - if isinstance(self.missing_distribution, (float, np.floating)): - drop_probs[:] = self.missing_distribution / n_features - elif isinstance(self.missing_distribution, - (list, tuple, np.ndarray)): - missing_distribution = np.asarray(self.missing_distribution) - if missing_distribution.shape[0] != n_features: + if isinstance(self.missing_proba, (list, tuple, np.ndarray)): + # Convert to ndarray and check shape + missing_proba = np.asarray(self.missing_proba) + if missing_proba.shape[0] != n_features: raise ValueError("The shape of the per feature " - "drop probabilities vector does not " + "drop-probabilities vector does not " "conform to the number of features, %d" % n_features) - drop_probs[:] = self.missing_distribution - else: - raise ValueError("missing_distribution must be a float or " - " 1D vector (list, tuple or np.ndarray) of " + elif not isinstance(self.missing_proba, + (np.floating, float, numbers.Integral, + np.integer)): + raise ValueError("missing_proba must be a float or " + "1D vector (list, tuple or np.ndarray) of " "shape (n_features,) or dict of 1D vector / " "floats. %r was passed" - % self.missing_distribution) - - if 1 - drop_probs.ravel().sum() <= np.finfo(float).eps: - raise ValueError("The sum of all probabilities in the " - "missing_distribution should sum up to less " - "than 1. The sum was found to be %0.8f" - % drop_probs.ravel().sum()) - - drop_counts = (drop_probs * n_values).astype(int) - - if missing_type == 'mcar': - self._block_drop_missing_values(X, Ellipsis, 0, drop_counts, rng, - missing_type, missing_values) - else: - # For classification, NMAR, consider missing distribution - # based on class labels as subsets within data - for i, class_i in enumerate(classes): - self._block_drop_missing_values(X, y == class_i, i, - drop_counts, rng, - missing_type, missing_values) - return X + % self.missing_proba) + + drop_probs[:] = self.missing_proba + # Hack to simplify code + classes = [0, ] + y = np.zeros(n_samples) + + if np.any(drop_probs < 0) or np.any(drop_probs > 1): + raise ValueError("All the individual drop-probabilities should be " + "within the range of [0, 1]. The given " + "missing_proba does not conform to that. %r" + % self.missing_proba) + + for i, class_i in enumerate(classes): + samples_mask = (y == class_i) + this_n_samples = samples_mask.sum() + this_block_indices = np.arange(n_samples)[samples_mask] + + for feature in range(n_features): + # Shuffle even if this_required_n_missing is 0, to maintain + # consistency in generated missing values for successively + # increasing % of missing values.% + shuffled_indices = rng.permutation(this_block_indices) + this_required_n_missing = int(round(drop_probs[i, feature] * + this_n_samples)) + if this_required_n_missing == 0: + continue + + if this_required_n_missing > this_n_samples: + raise ValueError("There are no more available values at " + "%sfeature - %s, to drop." 
+ # For NMAR, specify the label too + % ("class label - %s, " % class_i + if missing_type == 'nmar' else "", + feature)) + + # Drop them + X[shuffled_indices[:this_required_n_missing], + feature] = missing_values - def _block_drop_missing_values(self, X, samples_mask, encoded_label, - drop_counts, rng, missing_type, - missing_values): - """Helper to insert missing values in given block (label)""" - n_features = X.shape[1] - this_block_indices = np.arange(X.shape[0])[samples_mask] - for feature in range(n_features): - this_n_values = X[samples_mask].shape[0] - this_required_n_missing = drop_counts[encoded_label, feature] - - if this_required_n_missing <= 0: - continue - - if this_required_n_missing > this_n_values: - raise ValueError("There are no more available values at " - "%sfeature - %s, to drop." - # For NMAR, specify the label too - % ("label - %s, " % encoded_label - if missing_type == 'nmar' - else "", feature)) - - # Shuffle and pick this_required_n_missing indices for dropping - picked_indices = rng.permutation( - this_block_indices)[:this_required_n_missing] - - # Drop them - X[picked_indices, feature] = missing_values return X From d201a28c2493256c59dd44de9cae335491784129 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Tue, 29 Nov 2016 17:37:58 +0100 Subject: [PATCH 04/16] DOC edit whatsnew to reflect recent changes --- doc/whats_new.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 920983b2dca3a..5491c18e2a7ca 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -36,8 +36,10 @@ New features :issue:`5279` by `Nicolas Goix`_ and `Alexandre Gramfort`_. - Introduced the :class:`datasets.ValueDropper` transformer to artificially - introduce missing values conforming to a given distribution (NMAR) or - completely at random (MCAR). :issue:`7084` by `Raghav RV`_. + introduce missing values based on per-class or per-feature + drop-probabilities (for introducing NMAR missingness) or global + drop-probability (for introducing MCAR missingness). + :issue:`7084` by `Raghav RV`_. Enhancements ............ From 1c4eabfcb9171100c560fc9a5520e01c690a82ee Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Tue, 29 Nov 2016 18:07:37 +0100 Subject: [PATCH 05/16] Modify and fix minor inconsistencies in the example --- examples/datasets/generate_missing_values.py | 74 ++++++++++---------- 1 file changed, 36 insertions(+), 38 deletions(-) diff --git a/examples/datasets/generate_missing_values.py b/examples/datasets/generate_missing_values.py index 819b57d79807a..554063d325d47 100644 --- a/examples/datasets/generate_missing_values.py +++ b/examples/datasets/generate_missing_values.py @@ -4,31 +4,29 @@ ============================================= This example illustrates how the :class:`sklearn.datasets.ValueDropper` can -be used to generate missing values completely at random or conforming to the -given distribution. +be used to generate missing values completely at random or based on the +given drop-probabilities. The :class`sklearn.datasets.ValueDropper` is a transformer which can be -initialized with a ``missing_proba`` specifying the drop probabilites -for each label (and each feature if needed). It provisions preserving the -missing values of lower scaled ``missing_proba`` in a higher scaled -``missing_proba``. This facilitates benchmarking missing-value -strategies and evaluating the performance of such strategies with -respect to the type and extent of missingness in data. 
-
-It allows benchmarking with incremental missing rates (fraction of missing
-values to total number of values) without introducing a mismatch in the
-missing positions for previous lower rates of missing values.
+initialized with a ``missing_proba`` specifying the drop-probabilities
+for each class label (and each feature if needed). This facilitates
+benchmarking missing-value strategies and evaluating the performance of such
+strategies with respect to the type, extent and distribution of missingness in
+the data. Importantly, when ``random_state`` is set to an integer, it
+provisions preserving the drop-locations as the ``missing_proba`` is upscaled
+to study the effect of the increase in missingness. This allows benchmarking
+with incremental missing rates without causing variation in the results due to
+an inconsistency in the drop-locations between different scales of
+``missing_proba``.
 
 NMAR or Not Missing At Random refers to the case when the missingness in the
 data is distributed not at random. It is either correlated with the target
-value(s) or with the data itself.
+value(s) or with the data itself. In some references it is also referred to
+as MNAR or Missing Not At Random.
 
 MCAR or Missing Completely At Random refers to the case when the missingness
 in the data is completely random and does not correlate with the classification
 target value(s) or the data.
-
-In some references NMAR is sometimes referred to as MNAR (Missing Not At
-Random).
 """
 # Author: Raghav RV
 #
@@ -45,74 +43,74 @@
 X = np.random.RandomState(0).random_sample((20, 3))
 y = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2])
 
-# 10% of values in samples of class 1 will have missing values across all
-# features
+# Each feature of samples of class 1 will have 20% of thier values missing.
 
 vd = ValueDropper(missing_proba={1: 0.2}, random_state=0)
 X_dropped = vd.transform(X, y)
 
-print("\nAfter dropping 10% of values in samples of class 1 across all"
-      " features")
+print("\nAfter dropping 20% of values (per feature) in samples of class 1:")
 print("y", "X", sep="\t")
 print("------------------------")
 for i in range(y.shape[0]):
     print(y[i], X_dropped[i], sep="\t")
 
-# Drop another 10% of values in samples of class 1 across all features
+# Each feature of samples of class 1 will have another 20% of thier values
+# missing.
(Old locations will be preserved as random_state is set) vd = ValueDropper(missing_proba={1: 0.4}, random_state=0) X_dropped = vd.transform(X, y) -print("\nAfter dropping another 10% of values in samples of class 1 across all" - " features") +print("\nAfter dropping another 20% of values (per feature) in samples of " + "class 1:") print("y", "X", sep="\t") print("------------------------") for i in range(y.shape[0]): print(y[i], X_dropped[i], sep="\t") -# Drop 30% of values in all features completely at random +# Drop 30% of values in each feature completely at random vd = ValueDropper(missing_proba=0.3, random_state=0) X_dropped = vd.transform(X, y) -print("\nAfter dropping 30% of values randomly") +print("\nAfter dropping 30% of values randomly:") print("y", "X", sep="\t") print("------------------------") for i in range(y.shape[0]): print(y[i], X_dropped[i], sep="\t") -# Drop values based on the given probability distribution +# Drop values based on the given drop-probabilities - -# For samples of class 0, drop 10% of values (evenly across all features) -# For samples of class 1, drop 20% of values in feature 0, 40% in feature 1 +# For samples of class 0, drop 10% of values (in each feature) +# For samples of class 2, drop 20% of values in feature 0, 40% in feature 1 # and None in feature 2 -# Don't drop any values for samples of class 2. -missing_proba = {0: 0.1, 1: [0.2, 0.4, 0]} +# Don't drop any values for samples of class 1. +missing_proba = {0: 0.1, 2: [0.2, 0.4, 0]} vd = ValueDropper(missing_proba=missing_proba, random_state=0) X_dropped = vd.transform(X, y) -print("The given class wise missing_proba dict is %s " % missing_proba) print("\nAfter dropping one set of missing values based on the " - " missing_proba=%s" % missing_proba) + "missing_proba=%s" % missing_proba) print("y", "X", sep="\t") print("------------------------") for i in range(y.shape[0]): print(y[i], X_dropped[i], sep="\t") # Drop twice as many missing values as in previous step. 
-missing_proba = {0: 0.2, 1: [0.4, 0.6, 0]} +missing_proba = {0: 0.2, 2: [0.4, 0.6, 0]} vd = ValueDropper(missing_proba=missing_proba, random_state=0) X_dropped = vd.transform(X, y) -print("\nAfter dropping another set of missing values based on the new" - " missing_proba=%s" % missing_proba) +print("\nAfter dropping another set of missing values based on the new " + "missing_proba=%s" % missing_proba) print("y", "X", sep="\t") print("------------------------") for i in range(y.shape[0]): print(y[i], X_dropped[i], sep="\t") -missing_proba = {0: 0.3, 1: [0.6, 0.8, 0]} +# Drop more values and also drop 40% of values from samples of class 1 +# (in each feature) +missing_proba = {0: 0.3, 1: 0.4, 2: [0.6, 0.8, 0]} vd = ValueDropper(missing_proba=missing_proba, random_state=0) X_dropped = vd.transform(X, y) -print("\nAfter dropping another set of missing values based on the new" - " missing_proba=%s" % missing_proba) +print("\nAfter dropping another set of missing values based on the new " + "missing_proba=%s" % missing_proba) print("y", "X", sep="\t") print("------------------------") for i in range(y.shape[0]): From deabc723078f4203b9112ee7ae9120fe948c6ecb Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Tue, 29 Nov 2016 21:48:13 +0100 Subject: [PATCH 06/16] Fix tests and doc --- sklearn/datasets/tests/test_value_dropper.py | 17 ++-- sklearn/datasets/value_dropper.py | 89 +++++++++----------- 2 files changed, 50 insertions(+), 56 deletions(-) diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py index 7a4d147a6ba04..529d1216e0f68 100644 --- a/sklearn/datasets/tests/test_value_dropper.py +++ b/sklearn/datasets/tests/test_value_dropper.py @@ -40,13 +40,13 @@ def test_value_dropper_mnar_clf(): # Check the drop-probabilty for class 0 assert_almost_equal(missing_mask[y == classes[0]].sum() / - (np.sum(y == classes[0]) * n_features), 0.1, + float(np.sum(y == classes[0]) * n_features), 0.1, decimal=2) # All the missing values are from y == 0 assert_almost_equal( np.isnan(X_dropped[y == classes[0]]).ravel().sum() / - (np.sum(y == classes[0]) * n_features), 0.1, decimal=2) + float(np.sum(y == classes[0]) * n_features), 0.1, decimal=2) # and no missing values from y != 0 assert_equal(missing_mask[y != classes[0]].ravel().sum(), 0) @@ -71,12 +71,12 @@ def test_value_dropper_mnar_clf(): # across all features assert_array_almost_equal( missing_mask[y == classes[1]].sum(axis=0) / - np.sum(y == classes[1]), [0.5] * n_features, decimal=2) + float(np.sum(y == classes[1])), [0.5] * n_features, decimal=2) # Check that the drop probabilites when class == 0 are as given by # the missing_proba dict assert_array_almost_equal(missing_mask[y == classes[0]].sum(axis=0) / - np.sum(y == classes[0]), + float(np.sum(y == classes[0])), missing_proba[classes[0]], decimal=2) @@ -125,12 +125,12 @@ def check_value_dropper_mcar(X, y): # Check the rate for all even indexed features even_feature_missing_mask = missing_mask[:, missing_proba == 0.3] assert_almost_equal(even_feature_missing_mask.ravel().sum() / - even_feature_missing_mask.size, 0.3) + float(even_feature_missing_mask.size), 0.3) # Check the rate for all odd features odd_feature_missing_mask = missing_mask[:, missing_proba == 0.1] assert_almost_equal(odd_feature_missing_mask.ravel().sum() / - odd_feature_missing_mask.size, 0.1) + float(odd_feature_missing_mask.size), 0.1) # Let us drop 0.3 more fraction of values. 
This time not inplace # copy=True must be default @@ -139,9 +139,10 @@ def check_value_dropper_mcar(X, y): new_missing_mask = np.isnan(X_more_dropped) # Check global drop probability - assert_almost_equal(new_missing_mask.ravel().sum() / n_values, 0.6) + assert_almost_equal(new_missing_mask.ravel().sum() / float(n_values), 0.6) # Check the drop-probability for a random feature 3 - assert_almost_equal(new_missing_mask[:, 3].ravel().sum() / n_samples, 0.6) + assert_almost_equal(new_missing_mask[:, 3].ravel().sum() / + float(n_samples), 0.6) # Ensure X is not modified assert_array_almost_equal(X_copy2, X) diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py index 7296ba2366238..5446f49034206 100644 --- a/sklearn/datasets/value_dropper.py +++ b/sklearn/datasets/value_dropper.py @@ -81,20 +81,14 @@ class ValueDropper(TransformerMixin): The seed for the numpy's random number generator. If ``random_state`` is set to an integer, the ``missing_proba`` - can be scaled uniformly with the assumption that all the values + can be upscaled safely with the assumption that all the values dropped with a smaller scale will exist in the larger scaled version:: - missing_proba_25pc = {0: 0.05, 3: [0.05, 0.075, 0.075]} - missing_proba_50pc = {0: 0.1, 3: [0.1, 0.15, 0.15]} + missing_proba_25pc = {0: 0.1, 3: [0.3, 0.1, 0.1]} + missing_proba_50pc = {0: 0.1, 1:0.2, 3: [0.6, 0.1, 0.8]} The missing values dropped with ``missing_proba_25pc`` will also exist in ``missing_proba_50pc``. - This guarantee does not apply when relative probabilities - within the ``missing_proba`` change between two settings:: - missing_proba_25pc = {0: 0.05, 3: [0.05, 0.075, 0.075]} - # The below dist. is not a scaled version of above - missing_proba_50pc = {0: 0.09, 3: [0.11, 0.15, 0.15]} - Examples -------- @@ -112,13 +106,12 @@ class ValueDropper(TransformerMixin): ... [2., 1., 1.], ... [1., 2., 3.]]) >>> y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] - >>> abs_missing_rate = 0.1 - >>> # Drop values across features 0 and 1 in - >>> # the ratio of 4:1 from samples with class label as 1 - >>> # The final fraction values that will be missing - >>> missing_proba = {1: [0.8 * abs_missing_rate, - ... 0.2 * abs_missing_rate, - ... 0]} + >>> # NMAR missingness - + >>> # Drop values from samples of class 1 alone based on the below + >>> # missing_proba hence making it Not Missing At Random missingness. + >>> missing_proba = {1: [0.2, # Drop 20% values from feature 0 for class 0 + ... 0.2, # and class 1 + ... 0]} # Do not drop any values from feature 2 >>> vd = ValueDropper(missing_proba=missing_proba, random_state=0) >>> vd.transform(X, y) array([[ 0., 1., 2.], @@ -128,58 +121,58 @@ class ValueDropper(TransformerMixin): [ 2., 3., 4.], [ 8., 9., 8.], [ 1., 0., 5.], - [ nan, 8., 9.], - [ 5., 4., 3.], + [ 7., 8., 9.], + [ nan, 4., 3.], [ 2., 1., 1.], - [ nan, 2., 3.]]) - >>> # Upscale the missing_proba to add more missing values - >>> abs_missing_rate = 0.2 - >>> missing_proba = {1: [0.8 * abs_missing_rate, - ... 0.2 * abs_missing_rate, - ... 0]} + [ 1., nan, 3.]]) + >>> # Upscale the missing_proba to add more missing values in feature 0 + >>> # Also add a few missing values in all features for class 0 samples. 
+ >>> missing_proba = {1: [0.4, 0.2, 0], 0: 0.6} >>> vd = ValueDropper(missing_proba=missing_proba, random_state=0) >>> vd.transform(X, y) - array([[ 0., 1., 2.], - [ 3., 4., 5.], - [ 6., 7., 8.], - [ 9., 0., 1.], - [ 2., 3., 4.], - [ nan, 9., 8.], - [ nan, nan, 5.], - [ nan, 8., 9.], + array([[ nan, nan, 2.], + [ nan, nan, nan], + [ nan, nan, 8.], + [ 9., 0., nan], + [ 2., 3., nan], + [ 8., 9., 8.], + [ 1., 0., 5.], + [ 7., 8., 9.], [ nan, 4., 3.], [ 2., 1., 1.], - [ nan, 2., 3.]]) - >>> # MCAR missingness - >>> vd = ValueDropper(missing_proba=abs_missing_rate, random_state=0) + [ nan, nan, 3.]]) + >>> # MCAR missingness - + >>> # 30% of values in each feature Missing Completely At Random + >>> vd = ValueDropper(missing_proba=0.3, random_state=0) >>> vd.transform(X, y) array([[ 0., 1., 2.], [ 3., 4., 5.], - [ 6., 7., nan], - [ 9., 0., 1.], + [ nan, 7., nan], + [ 9., nan, 1.], [ nan, 3., 4.], [ 8., 9., 8.], [ 1., 0., nan], - [ 7., 8., 9.], + [ 7., 8., nan], [ 5., 4., 3.], [ nan, nan, 1.], [ 1., nan, 3.]]) - >>> # Upscale the missing_proba to add more missing values + >>> # Upscale the missing_proba to add more missing values in feature 0 and + >>> # 1 alone. Retain the same drop-probability for feature 2 >>> # Explicitly set copy=False for inplace dropping of values - >>> vd = ValueDropper(missing_proba=2 * abs_missing_rate, + >>> vd = ValueDropper(missing_proba=[0.6, 0.8, 0.3], ... copy=False, random_state=0) >>> _ = vd.transform(X, y) >>> X - array([[ 0., 1., 2.], - [ 3., 4., 5.], - [ nan, 7., nan], + array([[ 0., nan, 2.], + [ nan, nan, 5.], + [ nan, nan, nan], [ 9., nan, 1.], - [ nan, 3., 4.], + [ nan, nan, 4.], [ 8., nan, 8.], - [ 1., 0., nan], - [ 7., 8., nan], - [ 5., 4., 3.], - [ nan, nan, nan], + [ nan, 0., nan], + [ nan, 8., nan], + [ 5., nan, 3.], + [ nan, nan, 1.], [ nan, nan, 3.]]) """ From ca3b2e6fdb9296096f9306b5f66c9aa255665292 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Tue, 29 Nov 2016 22:18:31 +0100 Subject: [PATCH 07/16] Fake8 -_- --- sklearn/datasets/tests/test_value_dropper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py index 529d1216e0f68..5c417457f12a4 100644 --- a/sklearn/datasets/tests/test_value_dropper.py +++ b/sklearn/datasets/tests/test_value_dropper.py @@ -15,7 +15,6 @@ def test_value_dropper_mnar_clf(): # Test drop probabilites when missing distribution is # given for classification problems n_samples, n_features = 1000, 5 - n_values = n_samples * n_features X, y = make_classification(n_samples=n_samples, n_classes=4, n_features=n_features, @@ -184,7 +183,7 @@ def test_value_dropper_errors(): missing_probas = ( # NMAR cases {0: 2., 1: 0.25, 2: 0.25, 3: 0.25}, {0: 2, }, {0: -2, }, {0: 2.0, }, - {0: [0, 0, 0, 0, 0.24, 0, 0, 0, 0, -0.01],}, + {0: [0, 0, 0, 0, 0.24, 0, 0, 0, 0, -0.01], }, # MCAR cases [0, 0, 0, 0.2, 0.3, -0.1, 0, 0, 0, 0.5], 2.5, 1.5, [0, -1, 0, 0, 0, 0, 0, 0, 0, 0], 2, -2) From 7e6c90179fadfa084d4a65bacff2246c105db401 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Wed, 30 Nov 2016 14:18:07 +0100 Subject: [PATCH 08/16] Allow floats (w/TST); Simplify doc; Remove redundant checks --- sklearn/datasets/tests/test_value_dropper.py | 10 ++-- sklearn/datasets/value_dropper.py | 55 ++++---------------- 2 files changed, 17 insertions(+), 48 deletions(-) diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py index 5c417457f12a4..55e28d1329b52 100644 --- 
a/sklearn/datasets/tests/test_value_dropper.py
+++ b/sklearn/datasets/tests/test_value_dropper.py
@@ -84,9 +84,10 @@ def test_value_dropper_mnar_clf():
     # The up scaling need not be linear
     missing_proba = {classes[0]: [0.1, 0.5, 0.5, 0.1, 0], classes[1]: 0.8}
     vd = ValueDropper(missing_proba=missing_proba,
-                      missing_values=np.nan, random_state=0)
+                      missing_values=-100.2, random_state=0)
     X_dropped2 = vd.transform(X, y)
-    assert_true(np.all(np.isnan(X_dropped2[np.isnan(X_dropped)])))
+    new_missing_mask = X_dropped2 == -100.2
+    assert_true(np.all(new_missing_mask[missing_mask]))
 
 
 def test_value_dropper_mnar_reg_error():
@@ -133,9 +134,10 @@ def check_value_dropper_mcar(X, y):
 
     # Let us drop 0.3 more fraction of values. This time not inplace
     # copy=True must be default
-    vd = ValueDropper(missing_proba=0.6, random_state=0)
+    # Check with inf as missing values
+    vd = ValueDropper(missing_proba=0.6, missing_values=np.inf, random_state=0)
     X_more_dropped = vd.transform(X_copy2, y)
-    new_missing_mask = np.isnan(X_more_dropped)
+    new_missing_mask = np.isinf(X_more_dropped)
 
     # Check global drop probability
     assert_almost_equal(new_missing_mask.ravel().sum() / float(n_values), 0.6)

diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py
index 5446f49034206..83fca1c60cbac 100644
--- a/sklearn/datasets/value_dropper.py
+++ b/sklearn/datasets/value_dropper.py
@@ -35,44 +35,19 @@ class ValueDropper(TransformerMixin):
 
     Parameters
     ----------
-    missing_values : {"NaN" (or np.nan) | int}, default "NaN"
+    missing_values : {"NaN" (or np.nan) | int | float}, default "NaN"
         The value to insert to indicate missingness.
 
     missing_proba : dict of floats or dict of vector of floats
-        If ``missing_proba`` is a float within range [0, 1), it represents the
-        probability with which the values will be dropped.
-
-        The values are dropped (approximately) uniformly across all labels and
-        features. This type of missingness is referred to as MCAR.
-
         To vary the proportion of values dropped across each feature,
         individual drop-probabilities for each feature can be specified as a 1D
-        vector of shape ``(n_features,)``.
+        array-like of shape (n_features, ) (e.g. [0.1, 0.15, 0.1]).
 
-        If missingness is not MCAR, ``missing_proba`` can be used to specify
-        the drop-probabilities on a per-label (and if needed further on
-        per-feature basis.).
-
-        If ``missing_proba`` is a dict of floats::
-            {1: 0.2, 2: 0.3, 3: 0.5}
-
-        This represents, the drop-probabilities for samples of each
-        class-label. The missing values are evenly spread across all the
-        features.
+        If missingness is not MCAR, a dict of floats can be used to specify
+        the drop-probabilities on a per-label basis (e.g. {1: 0.2, 2: 0.3, 3: 0.5}).
 
-        If ``missing_proba`` is a dict of vectors (and scalars)::
-
-            {0: 0.1,
-             3: [0.1, 0.15]}
-
-        Note that the shape of the vector must be ``(n_features,)``
-
-        Samples from class 0 are dropped with probability of 0.1 for each
-        feature and those from class 3 are dropped with a probability of 0.1
-        in feature 0, 0.15 in feature 1 while there are no values dropped from
-        samples of class 1 and 2.
-
-        Note that the samples are randomly chosen "*for each feature*".
+        This dict can also contain 1D array-likes of shape (n_features, )
+        to vary drop-probabilities across features (e.g. {1: 0.1, 3: [0.1, 0.15, 0.1]}).
 
     copy : bool, default False
         Whether to copy the data or work inplace.
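The following minimal sketch makes the ``missing_proba`` contract documented
above concrete. It assumes only the ``ValueDropper`` API as described in this
patch series; the data and the rates are illustrative::

    import numpy as np
    from sklearn.datasets import ValueDropper

    X = np.arange(30, dtype=float).reshape(10, 3)
    y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

    # MCAR: a single float drops ~20% of the values in every feature
    X_mcar = ValueDropper(missing_proba=0.2, random_state=0).transform(X, y)

    # NMAR: a dict keyed by class label; samples of class 1 lose 40% of
    # feature 0, 20% of feature 1 and nothing in feature 2
    X_nmar = ValueDropper(missing_proba={1: [0.4, 0.2, 0.0]},
                          random_state=0).transform(X, y)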
@@ -83,11 +58,11 @@ class ValueDropper(TransformerMixin):
 
         If ``random_state`` is set to an integer, the ``missing_proba``
         can be upscaled safely with the assumption that all the values
         dropped with a smaller scale will exist in the larger scaled version::
-            missing_proba_25pc = {0: 0.1, 3: [0.3, 0.1, 0.1]}
-            missing_proba_50pc = {0: 0.1, 1:0.2, 3: [0.6, 0.1, 0.8]}
+            missing_proba_1 = {0: 0.1, 3: [0.3, 0.1, 0.1]}
+            missing_proba_2 = {0: 0.1, 1: 0.2, 3: [0.6, 0.1, 0.8]}
 
-        The missing values dropped with ``missing_proba_25pc`` will also
-        exist in ``missing_proba_50pc``.
+        The missing values dropped with ``missing_proba_1`` will also
+        be dropped with ``missing_proba_2``.
 
     Examples
     --------
@@ -189,7 +164,7 @@ def transform(self, X, y=None):
 
         Parameters
         ----------
-        X : ndarray like of shape (n_features, n_samples)
+        X : array-like of shape (n_samples, n_features)
             Data, in which the values must be dropped and set to
             ``missing_values``.
 
@@ -318,14 +293,6 @@ def transform(self, X, y=None):
             if this_required_n_missing == 0:
                 continue
 
-            if this_required_n_missing > this_n_samples:
-                raise ValueError("There are no more available values at "
-                                 "%sfeature - %s, to drop."
-                                 # For NMAR, specify the label too
-                                 % ("class label - %s, " % class_i
-                                    if missing_type == 'nmar' else "",
-                                    feature))
-
             # Drop them
             X[shuffled_indices[:this_required_n_missing],
               feature] = missing_values

From f78174a988755391730e188135da54f3d2220773 Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Wed, 30 Nov 2016 14:27:53 +0100
Subject: [PATCH 09/16] Use missing mask instead of recomputing it; Add err msg

---
 sklearn/datasets/tests/test_value_dropper.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py
index 55e28d1329b52..1f0d3cd15b167 100644
--- a/sklearn/datasets/tests/test_value_dropper.py
+++ b/sklearn/datasets/tests/test_value_dropper.py
@@ -42,23 +42,20 @@ def test_value_dropper_mnar_clf():
                         float(np.sum(y == classes[0]) * n_features), 0.1,
                         decimal=2)
 
-    # All the missing values are from y == 0
+    # Check drop-probability for samples of class 0
     assert_almost_equal(
-        np.isnan(X_dropped[y == classes[0]]).ravel().sum() /
+        missing_mask[y == classes[0]].ravel().sum() /
         float(np.sum(y == classes[0]) * n_features), 0.1, decimal=2)
 
     # and no missing values from y != 0
     assert_equal(missing_mask[y != classes[0]].ravel().sum(), 0)
 
-    # Samples from class 1 will have a drop probabilty of 0.5
-    # but spread unevenly across features as given by the
-    # list of probabilities
-    # And samples from class 0 will have a drop-probabilities as specified
+    # Samples from class 1 will have 50% of values missing in each feature
+    # and samples from class 0 will have drop-probabilities as specified
     # by a list of drop-probabilites for each feature
-    missing_proba = {classes[0]: [0.1, 0.2, 0.2, 0, 0],
-                     classes[1]: 0.5}
-    vd = ValueDropper(missing_proba=missing_proba,
-                      missing_values=np.nan, random_state=0)
+    missing_proba = {classes[0]: [0.1, 0.2, 0.2, 0, 0], classes[1]: 0.5}
+    vd = ValueDropper(missing_proba=missing_proba, missing_values=np.nan,
+                      random_state=0)
     X_dropped = vd.transform(X, y)
     missing_mask = np.isnan(X_dropped)
@@ -228,4 +225,5 @@ def test_value_dropper_errors():
     # When missing_proba is a dict, but y is not given
     missing_proba = {0: 0.025}
     assert_raise_message(
-        ValueError, "", ValueDropper(missing_proba=missing_proba).transform, X)
+        ValueError, "The missing_proba is a dict but y is None.",
+
        ValueDropper(missing_proba=missing_proba).transform, X)

From 4c608fd017198347c96cc0c1b1f02ccd7dd3ca4d Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Wed, 30 Nov 2016 14:28:24 +0100
Subject: [PATCH 10/16] Cosmit

---
 sklearn/datasets/tests/test_value_dropper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py
index 1f0d3cd15b167..6cccaa37837ab 100644
--- a/sklearn/datasets/tests/test_value_dropper.py
+++ b/sklearn/datasets/tests/test_value_dropper.py
@@ -69,7 +69,7 @@ def test_value_dropper_mnar_clf():
         missing_mask[y == classes[1]].sum(axis=0) /
         float(np.sum(y == classes[1])), [0.5] * n_features, decimal=2)
 
-    # Check that the drop probabilites when class == 0 are as given by 
+    # Check that the drop-probabilities when class == 0 are as given by
     # the missing_proba dict
     assert_array_almost_equal(missing_mask[y == classes[0]].sum(axis=0) /
                               float(np.sum(y == classes[0])),
                               missing_proba[classes[0]], decimal=2)

From 84a086be7e16d9c3b70073c452a92a1e3e4b7489 Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Wed, 30 Nov 2016 15:10:39 +0100
Subject: [PATCH 11/16] flake8

---
 sklearn/datasets/value_dropper.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py
index 83fca1c60cbac..6a4f1b37c0936 100644
--- a/sklearn/datasets/value_dropper.py
+++ b/sklearn/datasets/value_dropper.py
@@ -44,10 +44,12 @@ class ValueDropper(TransformerMixin):
         array-like of shape (n_features, ) (e.g. [0.1, 0.15, 0.1]).
 
         If missingness is not MCAR, a dict of floats can be used to specify
-        the drop-probabilities on a per-label basis (e.g. {1: 0.2, 2: 0.3, 3: 0.5}).
+        the drop-probabilities on a per-label basis
+        (e.g. {1: 0.2, 2: 0.3, 3: 0.5}).
 
         This dict can also contain 1D array-likes of shape (n_features, )
-        to vary drop-probabilities across features (e.g. {1: 0.1, 3: [0.1, 0.15, 0.1]}).
+        to vary drop-probabilities across features
+        (e.g. {1: 0.1, 3: [0.1, 0.15, 0.1]}).
 
     copy : bool, default False
         Whether to copy the data or work inplace.
@@ -193,7 +195,6 @@ def transform(self, X, y=None):
 
         # Validate y, and find type of missingness
         if isinstance(self.missing_proba, dict):
-            missing_type = 'nmar'
             # For NMAR
             # Validate and convert the missing_proba dict into a
             # 2D probability distribution along the features and labels
@@ -243,7 +244,6 @@ def transform(self, X, y=None):
             drop_probs[encoded_class_key, :] = val
 
         else:
-            missing_type = 'mcar'
             # For MCAR
             # Validate and convert the missing_proba dict into a
             # 1D probability distribution along the features

From 339a55a5bd8b8a749f6f07ff560d62ae13fd98d0 Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Mon, 5 Dec 2016 14:37:24 +0100
Subject: [PATCH 12/16] Generate a matrix of random states to avoid shuffling
 unneeded indices for consistency

---
 sklearn/datasets/value_dropper.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py
index 6a4f1b37c0936..78475ad732ade 100644
--- a/sklearn/datasets/value_dropper.py
+++ b/sklearn/datasets/value_dropper.py
@@ -278,21 +278,26 @@ def transform(self, X, y=None):
                              "missing_proba does not conform to that. %r"
                              % self.missing_proba)
 
+        # Generate random_states for each feature / label in advance
+        # This is important to maintain consistency in generated missing values
+        # for successively increasing missing percent.
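+        # Drawing one seed per (label, feature) cell up front means each
+        # block is permuted identically on every call with the same
+        # random_state, so increasing a drop-probability only extends the
+        # prefix of indices that get dropped.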
+ random_states = rng.randint(0, np.iinfo(np.int32).max, + drop_probs.shape) + for i, class_i in enumerate(classes): samples_mask = (y == class_i) this_n_samples = samples_mask.sum() this_block_indices = np.arange(n_samples)[samples_mask] for feature in range(n_features): - # Shuffle even if this_required_n_missing is 0, to maintain - # consistency in generated missing values for successively - # increasing % of missing values.% - shuffled_indices = rng.permutation(this_block_indices) this_required_n_missing = int(round(drop_probs[i, feature] * this_n_samples)) if this_required_n_missing == 0: continue + this_rng = check_random_state(random_state[i, feature]) + shuffled_indices = this_rng.permutation(this_block_indices) + # Drop them X[shuffled_indices[:this_required_n_missing], feature] = missing_values From 84ff832f151501692a4bd7e31946afae43583ef5 Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 5 Dec 2016 14:38:47 +0100 Subject: [PATCH 13/16] cosmits --- examples/datasets/generate_missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/datasets/generate_missing_values.py b/examples/datasets/generate_missing_values.py index 554063d325d47..954b325851933 100644 --- a/examples/datasets/generate_missing_values.py +++ b/examples/datasets/generate_missing_values.py @@ -43,7 +43,7 @@ X = np.random.RandomState(0).random_sample((20, 3)) y = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2]) -# Each feature of samples of class 1 will have 20% of thier values missing. +# For samples from class 1, each feature will be missing 20% of its values vd = ValueDropper(missing_proba={1: 0.2}, random_state=0) X_dropped = vd.transform(X, y) @@ -53,7 +53,7 @@ for i in range(y.shape[0]): print(y[i], X_dropped[i], sep="\t") -# Each feature of samples of class 1 will have another 20% of thier values +# Each feature of samples of class 1 will have a further 20% of its values # missing. 
(Old locations will be preserved as random_state is set) vd = ValueDropper(missing_proba={1: 0.4}, random_state=0) X_dropped = vd.transform(X, y) From 2d03e8e2bfc6f0cc6f9b27b6e4ae5ac79cf7e02f Mon Sep 17 00:00:00 2001 From: Raghav RV Date: Mon, 5 Dec 2016 15:13:55 +0100 Subject: [PATCH 14/16] Attempt unifying MCAR/MNAR missing_probs generation --- sklearn/datasets/tests/test_value_dropper.py | 17 +++-- sklearn/datasets/value_dropper.py | 76 +++++++++----------- 2 files changed, 43 insertions(+), 50 deletions(-) diff --git a/sklearn/datasets/tests/test_value_dropper.py b/sklearn/datasets/tests/test_value_dropper.py index 6cccaa37837ab..2b5f231f16306 100644 --- a/sklearn/datasets/tests/test_value_dropper.py +++ b/sklearn/datasets/tests/test_value_dropper.py @@ -199,23 +199,26 @@ def test_value_dropper_errors(): # Dict with labels having fewer or more than n_feature elements ({1: [0.01, ] * 9, }, - "for label, 1, does not conform to the number of features, 10"), + "For label, 1, the shape of the per feature drop-probabilities " + "vector does not conform to the number of features, 10"), - ({0: [0.01, ] * 10, 1: [0.01, ] * 11}, - "for label, 1, does not conform to the number of features, 10"), + ({0: [0.01, ] * 11, 1: [0.01, ] * 10}, + "For label, 0, the shape of the per feature drop-probabilities " + "vector does not conform to the number of features, 10"), # Dict having labels not present in y labels ({0: 0.025, 1: [0.0025, ] * 10, 2: 0.025, 3: 0.025, 4: 0.025}, "y contains new labels: \[4\]"), # Incorrect dict or incorrect value - ({0: {1: 0.2}, }, - "either be a single float or an array of shape \(n_features,\). " - "\{1: 0.2.*\} was passed for class label 0"), + ({0: 'foo', }, + "For label, 0, probability value must be a float or 1D vector \(list," + " tuple or np.ndarray\) of shape \(n_features,\) \'foo\' was passed"), ("foobar", "must be a float or 1D vector \(list, tuple or np.ndarray\)" - " of shape \(n_features,\) or dict")) + " of shape \(n_features,\) or dict of floats/1D vectors. " + "'foobar' was passed.")) for missing_proba, err_msg in wrong_missing_probas_err_pairs: assert_raises_regexp(ValueError, err_msg, diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py index 78475ad732ade..a6467f57ba9da 100644 --- a/sklearn/datasets/value_dropper.py +++ b/sklearn/datasets/value_dropper.py @@ -198,6 +198,7 @@ def transform(self, X, y=None): # For NMAR # Validate and convert the missing_proba dict into a # 2D probability distribution along the features and labels + missing_type = 'nmar' if y is None: raise ValueError("The missing_proba is a dict " @@ -220,57 +221,46 @@ def transform(self, X, y=None): drop_probs = np.zeros((n_classes, n_features), dtype=np.float64) - for class_key, val in self.missing_proba.items(): - # This will also validate incorrect values for class_key - encoded_class_key = le.transform([class_key, ])[0] - - if isinstance(val, (np.ndarray, list, tuple)): - val = np.asarray(val) - if val.shape[0] != n_features: - raise ValueError("The shape of the per feature " - "drop-probabilities vector " - "for label, %s, does not conform " - "to the number of features, %d" - % (class_key, n_features)) - elif not isinstance(val, (np.floating, float, - numbers.Integral, np.integer)): - raise ValueError("If missing_proba is a dict with " - "target labels as keys, the values of " - "the dict should either be a single " - "float or an array of shape " - "(n_features,). 
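+            # Unpack the dict once: the keys are class labels (encoded below
+            # via the LabelEncoder so that each one indexes a row of
+            # drop_probs) and the values are the per-class drop
+            # specifications (a float or a 1D vector of per-feature
+            # drop-probabilities).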
%r was passed for class " - "label %s" % (val, class_key)) - - drop_probs[encoded_class_key, :] = val - + class_keys, probas = zip(*self.missing_proba.items()) + encoded_class_keys = le.transform(class_keys) else: # For MCAR # Validate and convert the missing_proba dict into a # 1D probability distribution along the features + missing_type = 'mcar' drop_probs = np.zeros((1, n_features), dtype=np.float64) - if isinstance(self.missing_proba, (list, tuple, np.ndarray)): - # Convert to ndarray and check shape - missing_proba = np.asarray(self.missing_proba) - if missing_proba.shape[0] != n_features: - raise ValueError("The shape of the per feature " - "drop-probabilities vector does not " - "conform to the number of features, %d" - % n_features) - elif not isinstance(self.missing_proba, - (np.floating, float, numbers.Integral, - np.integer)): - raise ValueError("missing_proba must be a float or " + # Hack to simplify and unify missing generation code for nmar/mcar + classes = class_keys = encoded_class_keys = (0, ) + probas = (self.missing_proba, ) + y = np.zeros(n_samples) + + # For both nmar/mcar + for encoded_class_key, class_key, proba in zip(encoded_class_keys, + class_keys, probas): + if isinstance(proba, (np.ndarray, list, tuple)): + proba = np.asarray(proba) + if proba.shape[0] != n_features: + raise ValueError("%s shape of the per feature " + "drop-probabilities vector " + "does not conform to the number of " + "features, %d" + % ("For label, %s, the" % class_key + if missing_type == 'nmar' + else "The", n_features)) + elif not isinstance(proba, (np.floating, float, + numbers.Integral, np.integer)): + raise ValueError("%s value must be a float or " "1D vector (list, tuple or np.ndarray) of " - "shape (n_features,) or dict of 1D vector / " - "floats. %r was passed" - % self.missing_proba) + "shape (n_features,)%s %r was passed." + % ("For label, %s, probability" % class_key + if missing_type == 'nmar' + else 'Probability', + " or dict of floats/1D vectors." 
+                                     if missing_type == 'mcar' else "", proba))
+
+            drop_probs[encoded_class_key, :] = proba
 
         if np.any(drop_probs < 0) or np.any(drop_probs > 1):
             raise ValueError("All the individual drop-probabilities should be "
                              "within the range of [0, 1]. The given "
                              "missing_proba does not conform to that. %r"
                              % self.missing_proba)
@@ -295,7 +285,7 @@ def transform(self, X, y=None):
             if this_required_n_missing == 0:
                 continue
 
-            this_rng = check_random_state(random_state[i, feature])
+            this_rng = check_random_state(random_states[i, feature])
             shuffled_indices = this_rng.permutation(this_block_indices)
 
             # Drop them

From eddaff5a30761061475ff7e0d37f61eae3ed6f03 Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Mon, 5 Dec 2016 15:16:01 +0100
Subject: [PATCH 15/16] COSMITS

---
 examples/datasets/generate_missing_values.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/datasets/generate_missing_values.py b/examples/datasets/generate_missing_values.py
index 954b325851933..659d4844e24bf 100644
--- a/examples/datasets/generate_missing_values.py
+++ b/examples/datasets/generate_missing_values.py
@@ -1,7 +1,7 @@
 """
-=============================================
-Generating NMAR / MCAR missing_values in data
-=============================================
+================================================================
+Data Perturbation: Generating NMAR / MCAR missing_values in data
+================================================================
 
 This example illustrates how the :class:`sklearn.datasets.ValueDropper` can
 be used to generate missing values completely at random or based on the
 given drop-probabilities.
@@ -13,8 +13,8 @@
 benchmarking missing-value strategies and evaluating the performance of such
 strategies with respect to the type, extent and distribution of missingness in
 the data. Importantly, when ``random_state`` is set to an integer, it
-provisions preserving the drop-locations as the ``missing_proba`` is upscaled
-to study the effect of the increase in missingness. This allows benchmarking
+provisions preserving the drop-locations as the ``missing_proba`` is increased
+to study the effect of more missing values. This allows benchmarking
 with incremental missing rates without causing variation in the results due to
 an inconsistency in the drop-locations between different scales of
 ``missing_proba``.

From d6c26231d8d945afc80ceaaf26a58ca2dd0fb991 Mon Sep 17 00:00:00 2001
From: Raghav RV
Date: Mon, 5 Dec 2016 15:35:08 +0100
Subject: [PATCH 16/16] Fix doctest and flake8

---
 sklearn/datasets/value_dropper.py | 68 +++++++++++++++----------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/sklearn/datasets/value_dropper.py b/sklearn/datasets/value_dropper.py
index a6467f57ba9da..29ca635d131ad 100644
--- a/sklearn/datasets/value_dropper.py
+++ b/sklearn/datasets/value_dropper.py
@@ -96,59 +96,59 @@ class ValueDropper(TransformerMixin):
     [ 6., 7., 8.],
     [ 9., 0., 1.],
     [ 2., 3., 4.],
-    [ 8., 9., 8.],
+    [ 8., nan, 8.],
     [ 1., 0., 5.],
-    [ 7., 8., 9.],
-    [ nan, 4., 3.],
+    [ nan, 8., 9.],
+    [ 5., 4., 3.],
     [ 2., 1., 1.],
-    [ 1., nan, 3.]])
-    >>> # Upscale the missing_proba to add more missing values in feature 0
+    [ 1., 2., 3.]])
+    >>> # Increase the missing_proba to add more missing values in feature 0
     >>> # Also add a few missing values in all features for class 0 samples.
>>> missing_proba = {1: [0.4, 0.2, 0], 0: 0.6} >>> vd = ValueDropper(missing_proba=missing_proba, random_state=0) >>> vd.transform(X, y) - array([[ nan, nan, 2.], - [ nan, nan, nan], + array([[ nan, nan, nan], + [ nan, 4., nan], [ nan, nan, 8.], - [ 9., 0., nan], - [ 2., 3., nan], - [ 8., 9., 8.], - [ 1., 0., 5.], - [ 7., 8., 9.], - [ nan, 4., 3.], + [ 9., 0., 1.], + [ 2., nan, nan], + [ 8., nan, 8.], + [ nan, 0., 5.], + [ nan, 8., 9.], + [ 5., 4., 3.], [ 2., 1., 1.], - [ nan, nan, 3.]]) + [ 1., 2., 3.]]) >>> # MCAR missingness - >>> # 30% of values in each feature Missing Completely At Random >>> vd = ValueDropper(missing_proba=0.3, random_state=0) >>> vd.transform(X, y) - array([[ 0., 1., 2.], + array([[ 0., 1., nan], [ 3., 4., 5.], - [ nan, 7., nan], - [ 9., nan, 1.], - [ nan, 3., 4.], - [ 8., 9., 8.], - [ 1., 0., nan], - [ 7., 8., nan], - [ 5., 4., 3.], - [ nan, nan, 1.], - [ 1., nan, 3.]]) - >>> # Upscale the missing_proba to add more missing values in feature 0 and - >>> # 1 alone. Retain the same drop-probability for feature 2 + [ 6., nan, 8.], + [ 9., 0., 1.], + [ 2., nan, 4.], + [ nan, 9., 8.], + [ nan, nan, 5.], + [ nan, 8., nan], + [ 5., 4., nan], + [ 2., 1., 1.], + [ 1., 2., 3.]]) + >>> # Increase the missing_proba to add more missing values in feature 0 + >>> # and 1 alone. Retain the same drop-probability for feature 2 >>> # Explicitly set copy=False for inplace dropping of values >>> vd = ValueDropper(missing_proba=[0.6, 0.8, 0.3], ... copy=False, random_state=0) >>> _ = vd.transform(X, y) >>> X - array([[ 0., nan, 2.], + array([[ 0., 1., nan], [ nan, nan, 5.], - [ nan, nan, nan], - [ 9., nan, 1.], + [ 6., nan, 8.], + [ 9., 0., 1.], [ nan, nan, 4.], - [ 8., nan, 8.], - [ nan, 0., nan], - [ nan, 8., nan], - [ 5., nan, 3.], + [ nan, nan, 8.], + [ nan, nan, 5.], + [ nan, nan, nan], + [ 5., nan, nan], [ nan, nan, 1.], [ nan, nan, 3.]]) """ @@ -250,7 +250,7 @@ def transform(self, X, y=None): if missing_type == 'nmar' else "The", n_features)) elif not isinstance(proba, (np.floating, float, - numbers.Integral, np.integer)): + numbers.Integral, np.integer)): raise ValueError("%s value must be a float or " "1D vector (list, tuple or np.ndarray) of " "shape (n_features,)%s %r was passed."
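To summarise the mechanism this series converges on, here is a self-contained
sketch of the core drop loop and its preserve-drop-locations guarantee. It is
a condensed illustration, not code from the patch; the helper name and its
arguments are hypothetical::

    import numpy as np
    from sklearn.utils import check_random_state

    def drop_values(X, y, drop_probs, random_state=0, missing_value=np.nan):
        # drop_probs[i][j] is the fraction of class-i samples whose feature j
        # is set to missing_value.
        X = X.copy()
        rng = check_random_state(random_state)
        classes = np.unique(y)
        # One seed per (class, feature) cell, drawn up front: re-running with
        # larger probabilities reuses the same permutations and only extends
        # the prefix of dropped indices.
        seeds = rng.randint(0, np.iinfo(np.int32).max, np.shape(drop_probs))
        for i, class_i in enumerate(classes):
            block = np.flatnonzero(y == class_i)
            for j in range(X.shape[1]):
                n_missing = int(round(drop_probs[i][j] * block.shape[0]))
                if n_missing == 0:
                    continue
                shuffled = check_random_state(seeds[i, j]).permutation(block)
                X[shuffled[:n_missing], j] = missing_value
        return X

    # e.g. two classes, three features: class 0 loses 10% of feature 0 and
    # 30% of feature 2; class 1 loses 20% of features 0 and 1
    X_dropped = drop_values(np.random.rand(100, 3),
                            np.array([0] * 50 + [1] * 50),
                            [[0.1, 0.0, 0.3], [0.2, 0.2, 0.0]])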