diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst
index 0f0c5aaa15781..01dd03b0dd0c0 100644
--- a/doc/modules/feature_selection.rst
+++ b/doc/modules/feature_selection.rst
@@ -247,10 +247,38 @@ features::
 
   * :ref:`example_ensemble_plot_forest_importances_faces.py`: example
     on face recognition data.
 
+.. _mRMR:
+
+Minimum Redundancy Maximum Relevance (mRMR)
+===========================================
+
+This filter feature selector was proposed by Peng et al. in 2005. mRMR
+identifies a subset of features having maximal mutual information with the
+target (i.e. relevance) and minimal mutual information with each other (i.e.
+redundancy).
+
+The algorithm expects discretized features. Peng et al. suggest using the
+mean and standard deviation of each feature for that purpose, for instance
+dividing a feature into three levels::
+
+    (-inf, mean - std)
+    [mean - std, mean + std]
+    (mean + std, +inf)
+
+This strategy is implemented by :class:`MinRedundancyMaxRelevance`.
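+
+A minimal usage sketch (the three-level discretization is done here with
+:func:`numpy.digitize`; the random data and ``k=2`` are purely
+illustrative)::
+
+    >>> import numpy as np
+    >>> from sklearn.feature_selection import MinRedundancyMaxRelevance
+    >>> rng = np.random.RandomState(0)
+    >>> X_raw = rng.normal(size=(50, 4))
+    >>> # discretize each feature into the three levels above
+    >>> X = np.column_stack([np.digitize(x, [x.mean() - x.std(),
+    ...                                      x.mean() + x.std()])
+    ...                      for x in X_raw.T])
+    >>> y = rng.randint(0, 2, size=50)
+    >>> X_new = MinRedundancyMaxRelevance(k=2).fit_transform(X, y)
+    >>> X_new.shape
+    (50, 2)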
+
+.. topic:: References:
+
+    * H. Peng, F. Long, C. Ding, "Feature selection based on mutual
+      information: criteria of max-dependency, max-relevance, and
+      min-redundancy", IEEE Transactions on Pattern Analysis and Machine
+      Intelligence, Vol. 27, No. 8, pp. 1226-1238 (2005)
+
+
 Feature selection as part of a pipeline
 =======================================
 
-Feature selection is usually used as a pre-processing step before doing
+Feature selection is usually used as a pre-processing step before doing
 the actual learning. The recommended way to do this in scikit-learn is
 to use a :class:`sklearn.pipeline.Pipeline`::
@@ -260,10 +288,10 @@ to use a :class:`sklearn.pipeline.Pipeline`::
   ])
   clf.fit(X, y)
 
-In this snippet we make use of a :class:`sklearn.svm.LinearSVC`
+In this snippet we make use of a :class:`sklearn.svm.LinearSVC`
 to evaluate feature importances and select the most relevant features.
-Then, a class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the
-transformed output, i.e. using only relevant features. You can perform
+Then, a :class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the
+transformed output, i.e. using only relevant features. You can perform
 similar operations with the other feature selection methods and also
-classifiers that provide a way to evaluate feature importances of course.
+classifiers that provide a way to evaluate feature importances, of course.
 See the :class:`sklearn.pipeline.Pipeline` examples for more details.
diff --git a/examples/plot_mRMR.py b/examples/plot_mRMR.py
new file mode 100644
index 0000000000000..f9acf5d7d62ef
--- /dev/null
+++ b/examples/plot_mRMR.py
@@ -0,0 +1,69 @@
+"""
+===========================================
+Minimum redundancy maximum relevance (mRMR)
+===========================================
+
+Mutual information is a metric assessing the degree of statistical dependence
+between two random variables.
+
+mRMR feature selection consists of selecting a subset of the available
+features showing high mutual information with the target and low mutual
+information with each other.
+
+This example compares mRMR feature selection with recursive feature
+elimination (RFE) and univariate feature selection (Uni) on a synthetic
+dataset.
+
+The dataset has 100 samples and 3 features: A, B and C, which correctly
+classify 60%, 50% and 40% of the samples, respectively.
+
+Let's assume the plan is to choose only 2 of those 3 features. Given that A
+and B have higher accuracy, we would expect a selection algorithm to pick
+those two. However, it turns out that A and B are redundant with each other
+(i.e. they classify the same samples), whereas C, despite its lower accuracy,
+provides information independent of A and B.
+
+As expected, mRMR selects features A and C, while the other two selection
+algorithms select features A and B.
+
+.. note::
+
+    See also :ref:`example_plot_rfe_digits.py`,
+    :ref:`example_plot_feature_selection.py`
+
+"""
+print(__doc__)
+
+import numpy as np
+
+from sklearn.feature_selection import (RFE, SelectKBest, chi2,
+                                       MinRedundancyMaxRelevance)
+from sklearn.linear_model import LogisticRegression
+
+# Associating a class to each of the 100 samples in the dataset
+y = np.array([0] * 50 + [1] * 50)
+
+# Creating a feature able to classify 60% of the samples
+A = np.array([0] * 30 + [1] * 20 + [1] * 20 + [2] * 30)
+
+# Creating a feature able to classify 50% of the samples
+B = np.array([2] * 25 + [1] * 25 + [1] * 25 + [0] * 25)
+
+# Creating a feature able to classify 40% of the samples
+C = np.array([2] * 20 + [0] * 30 + [1] * 30 + [2] * 20)
+
+X = np.array([A, B, C]).T
+feature = ['A', 'B', 'C']
+
+# We will be using the following three selectors
+selectors = [('RFE', RFE(LogisticRegression(), 2)),
+             ('Uni', SelectKBest(chi2, k=2)),
+             ('mRMR', MinRedundancyMaxRelevance(k=2))]
+
+for name, selector in selectors:
+    k = selector.fit(X, y).get_support(True).tolist()
+    print('%s selected %s and %s' % (name, feature[k[0]], feature[k[1]]))
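+
+# For illustration (not required for the selection itself), the fitted mRMR
+# selector exposes the estimated quantities: `relevance` holds the mutual
+# information of each feature with y, and `redundancy` the pairwise mutual
+# information between features.
+m = MinRedundancyMaxRelevance(k=2).fit(X, y)
+print('Relevance of A, B, C: %s' % m.relevance)
+print('Redundancy matrix:\n%s' % m.redundancy)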
diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py
index 3d27638091995..cdad116b3dc4f 100644
--- a/sklearn/feature_selection/__init__.py
+++ b/sklearn/feature_selection/__init__.py
@@ -17,10 +17,13 @@
 
 from .variance_threshold import VarianceThreshold
 
+from .multivariate_filtering import MinRedundancyMaxRelevance
+
 from .rfe import RFE
 from .rfe import RFECV
 
 __all__ = ['GenericUnivariateSelect',
+           'MinRedundancyMaxRelevance',
            'RFE',
            'RFECV',
            'SelectFdr',
diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
new file mode 100644
index 0000000000000..8c6e601ba46f8
--- /dev/null
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -0,0 +1,154 @@
+# Author: Andrea Bravi
+# License: 3-clause BSD
+
+import numpy as np
+
+from ..base import BaseEstimator
+from .base import SelectorMixin
+from ..metrics.cluster.supervised import mutual_info_score
+from ..utils.validation import array2d
+
+
+class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
+    """
+    Select the subset of features with minimal redundancy and maximal
+    relevance (mRMR) with respect to the target.
+
+    IMPORTANT: this version only supports data in categorical or integer
+    form.
+
+    Parameters
+    ----------
+    k : int, default=2
+        Number of features to select
+    rule : string, default='diff'
+        Rule used to combine relevance and redundancy, either
+        'diff' - difference between the two (the MID criterion)
+        'prod' - relevance divided by mean redundancy (the MIQ criterion)
+
+    Attributes
+    ----------
+    mask : list, len=k
+        Integer list of the selected features, ordered by maximal relevance
+        and minimal redundancy
+    score : list, len=k
+        mRMR score associated with each entry in mask
+    relevance : array, shape=[n_features]
+        Relevance (mutual information with y) of all the features
+    redundancy : array, shape=[n_features, n_features]
+        Pairwise redundancy (mutual information) between all the features
+    X : array, shape=[n_samples, n_features]
+        Input dataset, must be either integer or categorical
+    y : array, shape=[n_samples]
+        Label vector, must be either integer or categorical
+
+    References
+    ----------
+    .. [1] H. Peng, F. Long, and C. Ding, "Feature selection based on mutual
+       information: criteria of max-dependency, max-relevance, and
+       min-redundancy", IEEE Transactions on Pattern Analysis and Machine
+       Intelligence, Vol. 27, No. 8, pp. 1226-1238, 2005.
+    """
+    def __init__(self, k=2, rule='diff'):
+        self.k = k
+        self.rule = rule
+
+    def fit(self, X, y):
+        """
+        Learn the features to select from X and y.
+
+        Parameters
+        ----------
+        X : array, shape=[n_samples, n_features]
+            Input dataset, must be either integer or categorical
+        y : array, shape=[n_samples]
+            Label vector, must be either integer or categorical
+
+        Returns
+        -------
+        self
+        """
+        X = array2d(X)
+
+        self.X = X
+        self.y = y
+        self.mask, self.score = self._compute_mRMR(X, y)
+        return self
+
+    def _get_support_mask(self):
+        """
+        Returns
+        -------
+        support : array, dtype=bool, shape=[n_features]
+            Boolean mask with True for the selected features
+        """
+        support = np.zeros(self.n_features, dtype=bool)
+        support[self.mask] = True
+        return support
+
+    def _compute_mRMR(self, X, y):
+        """
+        Parameters
+        ----------
+        X : array, shape=[n_samples, n_features]
+            Input dataset, must be either integer or categorical
+        y : array, shape=[n_samples]
+            Label vector, must be either integer or categorical
+
+        Returns
+        -------
+        mask : list, len=k
+            Integer list of the selected features, ordered by maximal
+            relevance and minimal redundancy
+        score : list, len=k
+            mRMR score associated with each entry in mask
+        """
+        if self.rule not in ('diff', 'prod'):
+            raise ValueError("rule should be either 'diff' or 'prod'")
+
+        M = X.shape[1]  # Number of features
+
+        # Computation of relevance (mutual information between each feature
+        # and the target) and redundancy (mutual information between each
+        # pair of features)
+        relevance = np.zeros(M)
+        redundancy = np.zeros([M, M])
+        for m1 in range(0, M):
+            relevance[m1] = mutual_info_score(X[:, m1], y)
+            for m2 in range(m1 + 1, M):
+                redundancy[m1, m2] = mutual_info_score(X[:, m1],
+                                                       X[:, m2])
+                redundancy[m2, m1] = redundancy[m1, m2]
+
+        # Sequential search optimization
+        mask = []
+        score = []
+        search_space = list(range(0, M))
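+
+        # Greedy forward selection: the first feature is simply the most
+        # relevant one; each following step adds the candidate feature f
+        # maximizing either
+        #     I(f; y) - mean_{s in selected} I(f; s)    rule='diff' (MID)
+        #     I(f; y) / mean_{s in selected} I(f; s)    rule='prod' (MIQ)
+        # which are the two criteria proposed by Peng et al. (2005).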
+        score.append(max(relevance))
+        ind = int(relevance.argmax(0))  # Most relevant feature
+        mask.append(ind)
+        search_space.pop(ind)
+
+        for m in range(0, self.k - 1):
+            # Mean redundancy of each candidate with the selected set
+            mean_red = np.mean(redundancy[:, search_space]
+                               .take(mask, axis=0), 0)
+            if self.rule == 'diff':
+                tmp_score = relevance[search_space] - mean_red
+            else:  # 'prod'
+                # Guard against division by zero for candidates with no
+                # redundancy at all
+                mean_red = np.maximum(mean_red, np.finfo(float).eps)
+                tmp_score = relevance[search_space] / mean_red
+            score.append(max(tmp_score))
+            ind = tmp_score.argmax(0)
+            mask.append(search_space[ind])
+            search_space.pop(ind)
+
+        self.n_features = M
+        self.relevance = relevance
+        self.redundancy = redundancy
+
+        return mask, score
diff --git a/sklearn/feature_selection/tests/test_multivariate_filtering.py b/sklearn/feature_selection/tests/test_multivariate_filtering.py
new file mode 100644
index 0000000000000..173f7bb68bfb4
--- /dev/null
+++ b/sklearn/feature_selection/tests/test_multivariate_filtering.py
@@ -0,0 +1,31 @@
+import numpy as np
+
+from sklearn.utils.testing import (assert_array_equal, assert_almost_equal,
+                                   assert_raises)
+
+from sklearn.feature_selection import MinRedundancyMaxRelevance
+
+X = np.array([[1, 3, 1],
+              [3, 3, 3],
+              [1, 3, 1],
+              [1, 3, 3],
+              [1, 3, 1]])
+
+y = np.array([3, 1, 3, 1, 3])
+
+
+def test_mRMR():
+    """Test MinRedundancyMaxRelevance with default settings."""
+    m = MinRedundancyMaxRelevance().fit(X, y)
+
+    # The third feature matches y exactly, so it is selected first; the
+    # first feature is then preferred over the constant second one
+    assert_array_equal([2, 0], m.mask)
+    assert_almost_equal(0.6730116670092563, m.score[0])
+
+    m = MinRedundancyMaxRelevance(rule='prod').fit(X, y)
+    # Relevance and redundancy of the feature selected at the second step
+    # are equal here, so the quotient ('prod') score is 1
+    assert_array_equal([2, 0], m.mask)
+    assert_almost_equal(1.0, m.score[1])
+
+    assert_raises(ValueError, MinRedundancyMaxRelevance(rule='none').fit, X, y)
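+
+    # Sanity-check the shapes of the estimated relevance and redundancy,
+    # as documented in the class docstring
+    assert_array_equal((3,), m.relevance.shape)
+    assert_array_equal((3, 3), m.redundancy.shape)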