diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst
index 0f0c5aaa15781..01dd03b0dd0c0 100644
--- a/doc/modules/feature_selection.rst
+++ b/doc/modules/feature_selection.rst
@@ -247,10 +247,38 @@ features::
 
   * :ref:`example_ensemble_plot_forest_importances_faces.py`: example
     on face recognition data.
 
+.. _mRMR:
+
+Minimum Redundancy Maximum Relevance (mRMR)
+===========================================
+
+This filter feature selector was proposed by Peng et al. in 2005. mRMR
+identifies a subset of features having maximal mutual information with the
+target (i.e. relevance) and minimal mutual information with each other (i.e.
+redundancy).
+
+The algorithm expects discretized features. Peng et al. suggest using the
+mean and standard deviation of each feature for that purpose, for instance
+dividing a feature into three levels::
+
+    (-inf, mean - std)
+    [mean - std, mean + std]
+    (mean + std, +inf)
+
+This strategy is implemented by :class:`MinRedundancyMaxRelevance`.
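+
+A minimal usage sketch (the three-level discretization is done here with
+:func:`numpy.digitize`; the random data and ``k=2`` are purely
+illustrative)::
+
+    >>> import numpy as np
+    >>> from sklearn.feature_selection import MinRedundancyMaxRelevance
+    >>> rng = np.random.RandomState(0)
+    >>> X_raw = rng.normal(size=(50, 4))
+    >>> # discretize each feature into the three levels above
+    >>> X = np.column_stack([np.digitize(x, [x.mean() - x.std(),
+    ...                                      x.mean() + x.std()])
+    ...                      for x in X_raw.T])
+    >>> y = rng.randint(0, 2, size=50)
+    >>> X_new = MinRedundancyMaxRelevance(k=2).fit_transform(X, y)
+    >>> X_new.shape
+    (50, 2)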
+
+.. topic:: References:
+
+    * H. Peng, F. Long, C. Ding, "Feature selection based on mutual
+      information: criteria of max-dependency, max-relevance, and
+      min-redundancy", IEEE Transactions on Pattern Analysis and Machine
+      Intelligence, Vol. 27, No. 8, pp. 1226-1238 (2005)
+
+
 Feature selection as part of a pipeline
 =======================================
 
-Feature selection is usually used as a pre-processing step before doing
+Feature selection is usually used as a pre-processing step before doing
 the actual learning. The recommended way to do this in scikit-learn is
 to use a :class:`sklearn.pipeline.Pipeline`::
@@ -260,10 +288,10 @@ to use a :class:`sklearn.pipeline.Pipeline`::
   ])
   clf.fit(X, y)
 
-In this snippet we make use of a :class:`sklearn.svm.LinearSVC`
+In this snippet we make use of a :class:`sklearn.svm.LinearSVC`
 to evaluate feature importances and select the most relevant features.
-Then, a class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the
-transformed output, i.e. using only relevant features. You can perform
+Then, a :class:`sklearn.ensemble.GradientBoostingClassifier` is trained on the
+transformed output, i.e. using only relevant features. You can perform
 similar operations with the other feature selection methods and also
-classifiers that provide a way to evaluate feature importances of course.
+classifiers that provide a way to evaluate feature importances, of course.
 See the :class:`sklearn.pipeline.Pipeline` examples for more details.
diff --git a/examples/plot_mRMR.py b/examples/plot_mRMR.py
new file mode 100644
index 0000000000000..f9acf5d7d62ef
--- /dev/null
+++ b/examples/plot_mRMR.py
@@ -0,0 +1,69 @@
+"""
+===========================================
+Minimum redundancy maximum relevance (mRMR)
+===========================================
+
+Mutual information is a metric assessing the degree of statistical dependence
+between two random variables.
+
+mRMR feature selection consists of selecting a subset of the available
+features showing high mutual information with the target and low mutual
+information with each other.
+
+This example compares mRMR feature selection with recursive feature
+elimination (RFE) and univariate feature selection (Uni) on a synthetic
+dataset.
+
+The dataset has 100 samples and 3 features: A, B and C, which correctly
+classify 60%, 50% and 40% of the samples, respectively.
+
+Let's assume the plan is to choose only 2 of those 3 features. Given that A
+and B have higher accuracy, we would expect a selection algorithm to pick
+those two. However, it turns out that A and B are redundant with each other
+(i.e. they classify the same samples), whereas C, despite its lower accuracy,
+provides information independent of A and B.
+
+As expected, mRMR selects features A and C, while the other two selection
+algorithms select features A and B.
+
+.. note::
+
+    See also :ref:`example_plot_rfe_digits.py`,
+    :ref:`example_plot_feature_selection.py`
+
+"""
+print(__doc__)
+
+import numpy as np
+
+from sklearn.feature_selection import (RFE, SelectKBest, chi2,
+                                       MinRedundancyMaxRelevance)
+from sklearn.linear_model import LogisticRegression
+
+# Associating a class to each of the 100 samples in the dataset
+y = np.array([0] * 50 + [1] * 50)
+
+# Creating a feature able to classify 60% of the samples
+A = np.array([0] * 30 + [1] * 20 + [1] * 20 + [2] * 30)
+
+# Creating a feature able to classify 50% of the samples
+B = np.array([2] * 25 + [1] * 25 + [1] * 25 + [0] * 25)
+
+# Creating a feature able to classify 40% of the samples
+C = np.array([2] * 20 + [0] * 30 + [1] * 30 + [2] * 20)
+
+X = np.array([A, B, C]).T
+feature = ['A', 'B', 'C']
+
+# We will be using the following three selectors
+selectors = [('RFE', RFE(LogisticRegression(), 2)),
+             ('Uni', SelectKBest(chi2, k=2)),
+             ('mRMR', MinRedundancyMaxRelevance(k=2))]
+
+for name, selector in selectors:
+    k = selector.fit(X, y).get_support(True).tolist()
+    print('%s selected %s and %s' % (name, feature[k[0]], feature[k[1]]))
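+
+# For illustration (not required for the selection itself), the fitted mRMR
+# selector exposes the estimated quantities: `relevance` holds the mutual
+# information of each feature with y, and `redundancy` the pairwise mutual
+# information between features.
+m = MinRedundancyMaxRelevance(k=2).fit(X, y)
+print('Relevance of A, B, C: %s' % m.relevance)
+print('Redundancy matrix:\n%s' % m.redundancy)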
diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py
index 3d27638091995..cdad116b3dc4f 100644
--- a/sklearn/feature_selection/__init__.py
+++ b/sklearn/feature_selection/__init__.py
@@ -17,10 +17,13 @@
 
 from .variance_threshold import VarianceThreshold
 
+from .multivariate_filtering import MinRedundancyMaxRelevance
+
 from .rfe import RFE
 from .rfe import RFECV
 
 __all__ = ['GenericUnivariateSelect',
+           'MinRedundancyMaxRelevance',
            'RFE',
            'RFECV',
            'SelectFdr',
diff --git a/sklearn/feature_selection/multivariate_filtering.py b/sklearn/feature_selection/multivariate_filtering.py
new file mode 100644
index 0000000000000..8c6e601ba46f8
--- /dev/null
+++ b/sklearn/feature_selection/multivariate_filtering.py
@@ -0,0 +1,154 @@
+# Author: Andrea Bravi
+# License: 3-clause BSD
+
+import numpy as np
+
+from ..base import BaseEstimator
+from .base import SelectorMixin
+from ..metrics.cluster.supervised import mutual_info_score
+from ..utils.validation import array2d
+
+
+class MinRedundancyMaxRelevance(BaseEstimator, SelectorMixin):
+    """
+    Select the subset of features with minimal redundancy and maximal
+    relevance (mRMR) with respect to the target.
+
+    IMPORTANT: this version only supports data in categorical or integer
+    form.
+
+    Parameters
+    ----------
+    k : int, default=2
+        Number of features to select
+    rule : string, default='diff'
+        Rule used to combine relevance and redundancy, either
+        'diff' - difference between the two (the MID criterion)
+        'prod' - relevance divided by mean redundancy (the MIQ criterion)
+
+    Attributes
+    ----------
+    mask : list, len=k
+        Integer list of the selected features, ordered by maximal relevance
+        and minimal redundancy
+    score : list, len=k
+        mRMR score associated with each entry in mask
+    relevance : array, shape=[n_features]
+        Relevance (mutual information with y) of all the features
+    redundancy : array, shape=[n_features, n_features]
+        Pairwise redundancy (mutual information) between all the features
+    X : array, shape=[n_samples, n_features]
+        Input dataset, must be either integer or categorical
+    y : array, shape=[n_samples]
+        Label vector, must be either integer or categorical
+
+    References
+    ----------
+    .. [1] H. Peng, F. Long, and C. Ding, "Feature selection based on mutual
+       information: criteria of max-dependency, max-relevance, and
+       min-redundancy", IEEE Transactions on Pattern Analysis and Machine
+       Intelligence, Vol. 27, No. 8, pp. 1226-1238, 2005.
+    """
+    def __init__(self, k=2, rule='diff'):
+        self.k = k
+        self.rule = rule
+
+    def fit(self, X, y):
+        """
+        Learn the features to select from X and y.
+
+        Parameters
+        ----------
+        X : array, shape=[n_samples, n_features]
+            Input dataset, must be either integer or categorical
+        y : array, shape=[n_samples]
+            Label vector, must be either integer or categorical
+
+        Returns
+        -------
+        self
+        """
+        X = array2d(X)
+
+        self.X = X
+        self.y = y
+        self.mask, self.score = self._compute_mRMR(X, y)
+        return self
+
+    def _get_support_mask(self):
+        """
+        Returns
+        -------
+        support : array, dtype=bool, shape=[n_features]
+            Boolean mask with True for the selected features
+        """
+        support = np.zeros(self.n_features, dtype=bool)
+        support[self.mask] = True
+        return support
+
+    def _compute_mRMR(self, X, y):
+        """
+        Parameters
+        ----------
+        X : array, shape=[n_samples, n_features]
+            Input dataset, must be either integer or categorical
+        y : array, shape=[n_samples]
+            Label vector, must be either integer or categorical
+
+        Returns
+        -------
+        mask : list, len=k
+            Integer list of the selected features, ordered by maximal
+            relevance and minimal redundancy
+        score : list, len=k
+            mRMR score associated with each entry in mask
+        """
+        if self.rule not in ('diff', 'prod'):
+            raise ValueError("rule should be either 'diff' or 'prod'")
+
+        M = X.shape[1]  # Number of features
+
+        # Computation of relevance (mutual information between each feature
+        # and the target) and redundancy (mutual information between each
+        # pair of features)
+        relevance = np.zeros(M)
+        redundancy = np.zeros([M, M])
+        for m1 in range(0, M):
+            relevance[m1] = mutual_info_score(X[:, m1], y)
+            for m2 in range(m1 + 1, M):
+                redundancy[m1, m2] = mutual_info_score(X[:, m1],
+                                                       X[:, m2])
+                redundancy[m2, m1] = redundancy[m1, m2]
+
+        # Sequential search optimization
+        mask = []
+        score = []
+        search_space = list(range(0, M))
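+
+        # Greedy forward selection: the first feature is simply the most
+        # relevant one; each following step adds the candidate feature f
+        # maximizing either
+        #     I(f; y) - mean_{s in selected} I(f; s)    rule='diff' (MID)
+        #     I(f; y) / mean_{s in selected} I(f; s)    rule='prod' (MIQ)
+        # which are the two criteria proposed by Peng et al. (2005).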
+        score.append(max(relevance))
+        ind = int(relevance.argmax(0))  # Most relevant feature
+        mask.append(ind)
+        search_space.pop(ind)
+
+        for m in range(0, self.k - 1):
+            # Mean redundancy of each candidate with the selected set
+            mean_red = np.mean(redundancy[:, search_space]
+                               .take(mask, axis=0), 0)
+            if self.rule == 'diff':
+                tmp_score = relevance[search_space] - mean_red
+            else:  # 'prod'
+                # Guard against division by zero for candidates with no
+                # redundancy at all
+                mean_red = np.maximum(mean_red, np.finfo(float).eps)
+                tmp_score = relevance[search_space] / mean_red
+            score.append(max(tmp_score))
+            ind = tmp_score.argmax(0)
+            mask.append(search_space[ind])
+            search_space.pop(ind)
+
+        self.n_features = M
+        self.relevance = relevance
+        self.redundancy = redundancy
+
+        return mask, score
diff --git a/sklearn/feature_selection/tests/test_multivariate_filtering.py b/sklearn/feature_selection/tests/test_multivariate_filtering.py
new file mode 100644
index 0000000000000..173f7bb68bfb4
--- /dev/null
+++ b/sklearn/feature_selection/tests/test_multivariate_filtering.py
@@ -0,0 +1,31 @@
+import numpy as np
+
+from sklearn.utils.testing import (assert_array_equal, assert_almost_equal,
+                                   assert_raises)
+
+from sklearn.feature_selection import MinRedundancyMaxRelevance
+
+X = np.array([[1, 3, 1],
+              [3, 3, 3],
+              [1, 3, 1],
+              [1, 3, 3],
+              [1, 3, 1]])
+
+y = np.array([3, 1, 3, 1, 3])
+
+
+def test_mRMR():
+    """Test MinRedundancyMaxRelevance with default settings."""
+    m = MinRedundancyMaxRelevance().fit(X, y)
+
+    # The third feature matches y exactly, so it is selected first; the
+    # first feature is then preferred over the constant second one
+    assert_array_equal([2, 0], m.mask)
+    assert_almost_equal(0.6730116670092563, m.score[0])
+
+    m = MinRedundancyMaxRelevance(rule='prod').fit(X, y)
+    # Relevance and redundancy of the feature selected at the second step
+    # are equal here, so the quotient ('prod') score is 1
+    assert_array_equal([2, 0], m.mask)
+    assert_almost_equal(1.0, m.score[1])
+
+    assert_raises(ValueError, MinRedundancyMaxRelevance(rule='none').fit, X, y)
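+
+    # Sanity-check the shapes of the estimated relevance and redundancy,
+    # as documented in the class docstring
+    assert_array_equal((3,), m.relevance.shape)
+    assert_array_equal((3, 3), m.redundancy.shape)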