diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 14969f2969713..5023564df7c55 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -534,6 +534,8 @@ From text feature_selection.chi2 feature_selection.f_classif feature_selection.f_regression + feature_selection.mutual_info_classif + feature_selection.mutual_info_regression .. _gaussian_process_ref: diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst index 60e4d0a38f7c8..8b7bfee654e77 100644 --- a/doc/modules/feature_selection.rst +++ b/doc/modules/feature_selection.rst @@ -67,8 +67,8 @@ as objects that implement the ``transform`` method: :class:`SelectFdr`, or family wise error :class:`SelectFwe`. * :class:`GenericUnivariateSelect` allows to perform univariate feature - selection with a configurable strategy. This allows to select the best - univariate selection strategy with hyper-parameter search estimator. + selection with a configurable strategy. This allows to select the best + univariate selection strategy with hyper-parameter search estimator. For instance, we can perform a :math:`\chi^2` test to the samples to retrieve only the two best features as follows: @@ -84,17 +84,24 @@ to retrieve only the two best features as follows: >>> X_new.shape (150, 2) -These objects take as input a scoring function that returns -univariate p-values: +These objects take as input a scoring function that returns univariate scores +and p-values (or only scores for :class:`SelectKBest` and +:class:`SelectPercentile`): - * For regression: :func:`f_regression` + * For regression: :func:`f_regression`, :func:`mutual_info_regression` - * For classification: :func:`chi2` or :func:`f_classif` + * For classification: :func:`chi2`, :func:`f_classif`, :func:`mutual_info_classif` + +The methods based on F-test estimate the degree of linear dependency between +two random variables. On the other hand, mutual information methods can capture +any kind of statistical dependency, but being nonparametric, they require more +samples for accurate estimation. .. topic:: Feature selection with sparse data If you use sparse data (i.e. data represented as sparse matrices), - only :func:`chi2` will deal with the data without making it dense. + :func:`chi2`, :func:`mutual_info_regression`, :func:`mutual_info_classif` + will deal with the data without making it dense. .. warning:: @@ -103,7 +110,9 @@ univariate p-values: .. topic:: Examples: - :ref:`example_feature_selection_plot_feature_selection.py` + * :ref:`example_feature_selection_plot_feature_selection.py` + + * :ref:`example_feature_selection_plot_f_test_vs_mi.py` .. _rfe: @@ -315,4 +324,4 @@ Then, a :class:`sklearn.ensemble.RandomForestClassifier` is trained on the transformed output, i.e. using only relevant features. You can perform similar operations with the other feature selection methods and also classifiers that provide a way to evaluate feature importances of course. -See the :class:`sklearn.pipeline.Pipeline` examples for more details. +See the :class:`sklearn.pipeline.Pipeline` examples for more details. \ No newline at end of file diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 720d7c3cb6ca7..e6fb4f8fc4ffa 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -15,6 +15,13 @@ Changelog New features ............ + - Added two functions for mutual information estimation: + :func:`feature_selection.mutual_info_classif` and + :func:`feature_selection.mutual_info_regression`. 
These functions can be
+      used in :class:`feature_selection.SelectKBest` and
+      :class:`feature_selection.SelectPercentile`, which now accept a callable
+      returning only `scores`. By `Andrea Bravi`_ and `Nikolay Mayorov`_.
+
   - The Gaussian Process module has been reimplemented and now offers classification
     and regression estimators through :class:`gaussian_process.GaussianProcessClassifier`
     and :class:`gaussian_process.GaussianProcessRegressor`. Among other things, the new
@@ -4037,3 +4044,6 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Imaculate: https://github.com/Imaculate
 .. _Bernardo Stein: https://github.com/DanielSidhion
+
+.. _Andrea Bravi: https://github.com/AndreaBravi
+
diff --git a/examples/feature_selection/plot_f_test_vs_mi.py b/examples/feature_selection/plot_f_test_vs_mi.py
new file mode 100644
index 0000000000000..7917f19962dc6
--- /dev/null
+++ b/examples/feature_selection/plot_f_test_vs_mi.py
@@ -0,0 +1,49 @@
+"""
+===========================================
+Comparison of F-test and mutual information
+===========================================
+
+This example illustrates the differences between univariate F-test statistics
+and mutual information.
+
+We consider 3 features x_1, x_2, x_3 distributed uniformly over [0, 1]; the
+target depends on them as follows:
+
+y = x_1 + sin(6 * pi * x_2) + 0.1 * N(0, 1), that is, the third feature is completely irrelevant.
+
+The code below plots the dependency of y against individual x_i and normalized
+values of univariate F-test statistics and mutual information.
+
+As F-test captures only linear dependency, it rates x_1 as the most
+discriminative feature. On the other hand, mutual information can capture any
+kind of dependency between variables and it rates x_2 as the most
+discriminative feature, which probably agrees better with our intuitive
+perception for this example. Both methods correctly mark x_3 as irrelevant.
+""" +print(__doc__) + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.feature_selection import f_regression, mutual_info_regression + +np.random.seed(0) +X = np.random.rand(1000, 3) +y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * np.random.randn(1000) + +f_test, _ = f_regression(X, y) +f_test /= np.max(f_test) + +mi = mutual_info_regression(X, y) +mi /= np.max(mi) + +plt.figure(figsize=(15, 5)) +for i in range(3): + plt.subplot(1, 3, i + 1) + plt.scatter(X[:, i], y) + plt.xlabel("$x_{}$".format(i + 1), fontsize=14) + if i == 0: + plt.ylabel("$y$", fontsize=14) + plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]), + fontsize=16) +plt.show() + diff --git a/examples/feature_selection/plot_rfe_digits.py b/examples/feature_selection/plot_rfe_digits.py index 626a25afef231..2427944a2f112 100644 --- a/examples/feature_selection/plot_rfe_digits.py +++ b/examples/feature_selection/plot_rfe_digits.py @@ -33,4 +33,4 @@ plt.matshow(ranking, cmap=plt.cm.Blues) plt.colorbar() plt.title("Ranking of pixels with RFE") -plt.show() +plt.show() \ No newline at end of file diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py index acb03f6f24a9e..ffa392b5b26db 100644 --- a/sklearn/feature_selection/__init__.py +++ b/sklearn/feature_selection/__init__.py @@ -22,6 +22,9 @@ from .from_model import SelectFromModel +from .mutual_info_ import mutual_info_regression, mutual_info_classif + + __all__ = ['GenericUnivariateSelect', 'RFE', 'RFECV', @@ -29,10 +32,12 @@ 'SelectFpr', 'SelectFwe', 'SelectKBest', + 'SelectFromModel', 'SelectPercentile', 'VarianceThreshold', 'chi2', 'f_classif', 'f_oneway', 'f_regression', - 'SelectFromModel'] + 'mutual_info_classif', + 'mutual_info_regression'] diff --git a/sklearn/feature_selection/mutual_info_.py b/sklearn/feature_selection/mutual_info_.py new file mode 100644 index 0000000000000..0b205c2011c7a --- /dev/null +++ b/sklearn/feature_selection/mutual_info_.py @@ -0,0 +1,438 @@ +# Author: Nikolay Mayorov +# License: 3-clause BSD +from __future__ import division + +import numpy as np +from scipy.sparse import issparse +from scipy.special import digamma + +from ..externals.six import moves +from ..metrics.cluster.supervised import mutual_info_score +from ..neighbors import NearestNeighbors +from ..preprocessing import scale +from ..utils import check_random_state +from ..utils.validation import check_X_y +from ..utils.multiclass import check_classification_targets + + +def _compute_mi_cc(x, y, n_neighbors): + """Compute mutual information between two continuous variables. + + Parameters + ---------- + x, y : ndarray, shape (n_samples,) + Samples of two continuous random variables, must have an identical + shape. + + n_neighbors : int + Number of nearest neighbors to search for each point, see [1]_. + + Returns + ------- + mi : float + Estimated mutual information. If it turned out to be negative it is + replace by 0. + + Notes + ----- + True mutual information can't be negative. If its estimate by a numerical + method is negative, it means (providing the method is adequate) that the + mutual information is close to 0 and replacing it by 0 is a reasonable + strategy. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + """ + n_samples = x.size + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + xy = np.hstack((x, y)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. 
+ nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors) + + nn.fit(xy) + radius = nn.kneighbors()[0] + radius = np.nextafter(radius[:, -1], 0) + + # Algorithm is selected explicitly to allow passing an array as radius + # later (not all algorithms support this). + nn.set_params(algorithm='kd_tree') + + nn.fit(x) + ind = nn.radius_neighbors(radius=radius, return_distance=False) + nx = np.array([i.size for i in ind]) + + nn.fit(y) + ind = nn.radius_neighbors(radius=radius, return_distance=False) + ny = np.array([i.size for i in ind]) + + mi = (digamma(n_samples) + digamma(n_neighbors) - + np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1))) + + return max(0, mi) + + +def _compute_mi_cd(c, d, n_neighbors): + """Compute mutual information between continuous and discrete variables. + + Parameters + ---------- + c : ndarray, shape (n_samples,) + Samples of a continuous random variable. + + d : ndarray, shape (n_samples,) + Samples of a discrete random variable. + + n_neighbors : int + Number of nearest neighbors to search for each point, see [1]_. + + Returns + ------- + mi : float + Estimated mutual information. If it turned out to be negative it is + replace by 0. + + Notes + ----- + True mutual information can't be negative. If its estimate by a numerical + method is negative, it means (providing the method is adequate) that the + mutual information is close to 0 and replacing it by 0 is a reasonable + strategy. + + References + ---------- + .. [1] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + n_samples = c.shape[0] + c = c.reshape((-1, 1)) + + radius = np.empty(n_samples) + label_counts = np.empty(n_samples) + k_all = np.empty(n_samples) + nn = NearestNeighbors() + for label in np.unique(d): + mask = d == label + count = np.sum(mask) + if count > 1: + k = min(n_neighbors, count - 1) + nn.set_params(n_neighbors=k) + nn.fit(c[mask]) + r = nn.kneighbors()[0] + radius[mask] = np.nextafter(r[:, -1], 0) + k_all[mask] = k + label_counts[mask] = count + + # Ignore points with unique labels. + mask = label_counts > 1 + n_samples = np.sum(mask) + label_counts = label_counts[mask] + k_all = k_all[mask] + c = c[mask] + radius = radius[mask] + + nn.set_params(algorithm='kd_tree') + nn.fit(c) + ind = nn.radius_neighbors(radius=radius, return_distance=False) + m_all = np.array([i.size for i in ind]) + + mi = (digamma(n_samples) + np.mean(digamma(k_all)) - + np.mean(digamma(label_counts)) - + np.mean(digamma(m_all + 1))) + + return max(0, mi) + + +def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3): + """Compute mutual information between two variables. + + This is a simple wrapper which selects a proper function to call based on + whether `x` and `y` are discrete or not. + """ + if x_discrete and y_discrete: + return mutual_info_score(x, y) + elif x_discrete and not y_discrete: + return _compute_mi_cd(y, x, n_neighbors) + elif not x_discrete and y_discrete: + return _compute_mi_cd(x, y, n_neighbors) + else: + return _compute_mi_cc(x, y, n_neighbors) + + +def _iterate_columns(X, columns=None): + """Iterate over columns of a matrix. + + Parameters + ---------- + X : ndarray or csc_matrix, shape (n_samples, n_features) + Matrix over which to iterate. + + columns : iterable or None, default None + Indices of columns to iterate over. If None, iterate over all columns. + + Yields + ------ + x : ndarray, shape (n_samples,) + Columns of `X` in dense format. 
+ """ + if columns is None: + columns = range(X.shape[1]) + + if issparse(X): + for i in columns: + x = np.zeros(X.shape[0]) + start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] + x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] + yield x + else: + for i in columns: + yield X[:, i] + + +def _estimate_mi(X, y, discrete_features='auto', discrete_target=False, + n_neighbors=3, copy=True, random_state=None): + """Estimate mutual information between the features and the target. + + Parameters + ---------- + X : array_like or sparse matrix, shape (n_samples, n_features) + Feature matrix. + + y : array_like, shape (n_samples,) + Target vector. + + discrete_features : {'auto', bool, array_like}, default 'auto' + If bool, then determines whether to consider all features discrete + or continuous. If array, then it should be either a boolean mask + with shape (n_features,) or array with indices of discrete features. + If 'auto', it is assigned to False for dense `X` and to True for + sparse `X`. + + discrete_target : bool, default False + Whether to consider `y` as a discrete variable. + + n_neighbors : int, default 3 + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + + copy : bool, default True + Whether to make a copy of the given data. If set to False, the initial + data will be overwritten. + + random_state : int seed, RandomState instance or None, default None + The seed of the pseudo random number generator for adding small noise + to continuous variables in order to remove repeated values. + + Returns + ------- + mi : ndarray, shape (n_features,) + Estimated mutual information between each feature and the target. + A negative value will be replaced by 0. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + X, y = check_X_y(X, y, accept_sparse='csc', y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if discrete_features == 'auto': + discrete_features = issparse(X) + + if isinstance(discrete_features, bool): + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = np.asarray(discrete_features) + if discrete_features.dtype != 'bool': + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + if copy: + X = X.copy() + + if not discrete_target: + X[:, continuous_mask] = scale(X[:, continuous_mask], + with_mean=False, copy=False) + + # Add small noise to continuous features as advised in Kraskov et. al. 
+ X = X.astype(float) + means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += 1e-10 * means * rng.randn( + n_samples, np.sum(continuous_mask)) + + if not discrete_target: + y = scale(y, with_mean=False) + y += 1e-10 * np.maximum(1, np.mean(np.abs(y))) * rng.randn(n_samples) + + mi = [_compute_mi(x, y, discrete_feature, discrete_target) for + x, discrete_feature in moves.zip(_iterate_columns(X), discrete_mask)] + + return np.array(mi) + + +def mutual_info_regression(X, y, discrete_features='auto', n_neighbors=3, + copy=True, random_state=None): + """Estimate mutual information for a continuous target variable. + + Mutual information (MI) [1]_ between two random variables is a non-negative + value, which measures the dependency between the variables. It is equal + to zero if and only if two random variables are independent, and higher + values mean higher dependency. + + The function relies on nonparametric methods based on entropy estimation + from k-nearest neighbors distances as described in [2]_ and [3]_. Both + methods are based on the idea originally proposed in [4]_. + + It can be used for univariate features selection, read more in the + :ref:`User Guide `. + + Parameters + ---------- + X : array_like or sparse matrix, shape (n_samples, n_features) + Feature matrix. + + y : array_like, shape (n_samples,) + Target vector. + + discrete_features : {'auto', bool, array_like}, default 'auto' + If bool, then determines whether to consider all features discrete + or continuous. If array, then it should be either a boolean mask + with shape (n_features,) or array with indices of discrete features. + If 'auto', it is assigned to False for dense `X` and to True for + sparse `X`. + + n_neighbors : int, default 3 + Number of neighbors to use for MI estimation for continuous variables, + see [2]_ and [3]_. Higher values reduce variance of the estimation, but + could introduce a bias. + + copy : bool, default True + Whether to make a copy of the given data. If set to False, the initial + data will be overwritten. + + random_state : int seed, RandomState instance or None, default None + The seed of the pseudo random number generator for adding small noise + to continuous variables in order to remove repeated values. + + Returns + ------- + mi : ndarray, shape (n_features,) + Estimated mutual information between each feature and the target. + + Notes + ----- + 1. The term "discrete features" is used instead of naming them + "categorical", because it describes the essence more accurately. + For example, pixel intensities of an image are discrete features + (but hardly categorical) and you will get better results if mark them + as such. Also note, that treating a continuous variable as discrete and + vice versa will usually give incorrect results, so be attentive about that. + 2. True mutual information can't be negative. If its estimate turns out + to be negative, it is replaced by zero. + + References + ---------- + .. [1] `Mutual Information `_ + on Wikipedia. + .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [3] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy + of a Random Vector", Probl. 
Peredachi Inf., 23:2 (1987), 9-16 + """ + return _estimate_mi(X, y, discrete_features, False, n_neighbors, + copy, random_state) + + +def mutual_info_classif(X, y, discrete_features='auto', n_neighbors=3, + copy=True, random_state=None): + """Estimate mutual information for a discrete target variable. + + Mutual information (MI) [1]_ between two random variables is a non-negative + value, which measures the dependency between the variables. It is equal + to zero if and only if two random variables are independent, and higher + values mean higher dependency. + + The function relies on nonparametric methods based on entropy estimation + from k-nearest neighbors distances as described in [2]_ and [3]_. Both + methods are based on the idea originally proposed in [4]_. + + It can be used for univariate features selection, read more in the + :ref:`User Guide `. + + Parameters + ---------- + X : array_like or sparse matrix, shape (n_samples, n_features) + Feature matrix. + + y : array_like, shape (n_samples,) + Target vector. + + discrete_features : {'auto', bool, array_like}, default 'auto' + If bool, then determines whether to consider all features discrete + or continuous. If array, then it should be either a boolean mask + with shape (n_features,) or array with indices of discrete features. + If 'auto', it is assigned to False for dense `X` and to True for + sparse `X`. + + n_neighbors : int, default 3 + Number of neighbors to use for MI estimation for continuous variables, + see [2]_ and [3]_. Higher values reduce variance of the estimation, but + could introduce a bias. + + copy : bool, default True + Whether to make a copy of the given data. If set to False, the initial + data will be overwritten. + + random_state : int seed, RandomState instance or None, default None + The seed of the pseudo random number generator for adding small noise + to continuous variables in order to remove repeated values. + + Returns + ------- + mi : ndarray, shape (n_features,) + Estimated mutual information between each feature and the target. + + Notes + ----- + 1. The term "discrete features" is used instead of naming them + "categorical", because it describes the essence more accurately. + For example, pixel intensities of an image are discrete features + (but hardly categorical) and you will get better results if mark them + as such. Also note, that treating a continuous variable as discrete and + vice versa will usually give incorrect results, so be attentive about that. + 2. True mutual information can't be negative. If its estimate turns out + to be negative, it is replaced by zero. + + References + ---------- + .. [1] `Mutual Information `_ + on Wikipedia. + .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [3] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy + of a Random Vector:, Probl. 
Peredachi Inf., 23:2 (1987), 9-16 + """ + check_classification_targets(y) + return _estimate_mi(X, y, discrete_features, True, n_neighbors, + copy, random_state) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 204d7c2e25dba..d2a73334299d5 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -7,6 +7,7 @@ import numpy as np from scipy import stats, sparse +from numpy.testing import run_module_suite from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_raises @@ -24,10 +25,10 @@ from sklearn.datasets.samples_generator import (make_classification, make_regression) -from sklearn.feature_selection import (chi2, f_classif, f_oneway, f_regression, - SelectPercentile, SelectKBest, - SelectFpr, SelectFdr, SelectFwe, - GenericUnivariateSelect) +from sklearn.feature_selection import ( + chi2, f_classif, f_oneway, f_regression, mutual_info_classif, + mutual_info_regression, SelectPercentile, SelectKBest, SelectFpr, + SelectFdr, SelectFwe, GenericUnivariateSelect) ############################################################################## @@ -556,3 +557,65 @@ def test_no_feature_selected(): X_selected = assert_warns_message( UserWarning, 'No features were selected', selector.transform, X) assert_equal(X_selected.shape, (40, 0)) + + +def test_mutual_info_classif(): + X, y = make_classification(n_samples=100, n_features=5, + n_informative=1, n_redundant=1, + n_repeated=0, n_classes=2, + n_clusters_per_class=1, flip_y=0.0, + class_sep=10, shuffle=False, random_state=0) + + # Test in KBest mode. + univariate_filter = SelectKBest(mutual_info_classif, k=2) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = GenericUnivariateSelect( + mutual_info_classif, mode='k_best', param=2).fit(X, y).transform(X) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(5) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + # Test in Percentile mode. + univariate_filter = SelectPercentile(mutual_info_classif, percentile=40) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = GenericUnivariateSelect( + mutual_info_classif, mode='percentile', param=40).fit(X, y).transform(X) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(5) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + +def test_mutual_info_regression(): + X, y = make_regression(n_samples=100, n_features=10, n_informative=2, + shuffle=False, random_state=0, noise=10) + + # Test in KBest mode. + univariate_filter = SelectKBest(mutual_info_regression, k=2) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = GenericUnivariateSelect( + mutual_info_regression, mode='k_best', param=2).fit(X, y).transform(X) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(10) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + # Test in Percentile mode. 
+ univariate_filter = SelectPercentile(mutual_info_regression, percentile=20) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='percentile', + param=20).fit(X, y).transform(X) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(10) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + +if __name__ == '__main__': + run_module_suite() diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py new file mode 100644 index 0000000000000..f9b86777dcbe3 --- /dev/null +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -0,0 +1,193 @@ +from __future__ import division + +import numpy as np +from numpy.testing import run_module_suite +from scipy.sparse import csr_matrix + +from sklearn.utils.testing import (assert_array_equal, assert_almost_equal, + assert_false, assert_raises, assert_equal) +from sklearn.feature_selection.mutual_info_ import ( + mutual_info_regression, mutual_info_classif, _compute_mi) + + +def test_compute_mi_dd(): + # In discrete case computations are straightforward and can be done + # by hand on given vectors. + x = np.array([0, 1, 1, 0, 0]) + y = np.array([1, 0, 0, 0, 1]) + + H_x = H_y = -(3/5) * np.log(3/5) - (2/5) * np.log(2/5) + H_xy = -1/5 * np.log(1/5) - 2/5 * np.log(2/5) - 2/5 * np.log(2/5) + I_xy = H_x + H_y - H_xy + + assert_almost_equal(_compute_mi(x, y, True, True), I_xy) + + +def test_compute_mi_cc(): + # For two continuous variables a good approach is to test on bivariate + # normal distribution, where mutual information is known. + + # Mean of the distribution, irrelevant for mutual information. + mean = np.zeros(2) + + # Setup covariance matrix with correlation coeff. equal 0.5. + sigma_1 = 1 + sigma_2 = 10 + corr = 0.5 + cov = np.array([ + [sigma_1**2, corr * sigma_1 * sigma_2], + [corr * sigma_1 * sigma_2, sigma_2**2] + ]) + + # True theoretical mutual information. + I_theory = (np.log(sigma_1) + np.log(sigma_2) - + 0.5 * np.log(np.linalg.det(cov))) + + np.random.seed(0) + Z = np.random.multivariate_normal(mean, cov, size=1000) + + x, y = Z[:, 0], Z[:, 1] + + # Theory and computed values won't be very close, assert that the + # first figures after decimal point match. + for n_neighbors in [3, 5, 7]: + I_computed = _compute_mi(x, y, False, False, n_neighbors) + assert_almost_equal(I_computed, I_theory, 1) + + +def test_compute_mi_cd(): + # To test define a joint distribution as follows: + # p(x, y) = p(x) p(y | x) + # X ~ Bernoulli(p) + # (Y | x = 0) ~ Uniform(-1, 1) + # (Y | x = 1) ~ Uniform(0, 2) + + # Use the following formula for mutual information: + # I(X; Y) = H(Y) - H(Y | X) + # Two entropies can be computed by hand: + # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2) + # H(Y | X) = ln(2) + + # Now we need to implement sampling from out distribution, which is + # done easily using conditional distribution logic. + + n_samples = 1000 + np.random.seed(0) + + for p in [0.3, 0.5, 0.7]: + x = np.random.uniform(size=n_samples) > p + + y = np.empty(n_samples) + mask = x == 0 + y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)) + y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)) + + I_theory = -0.5 * ((1 - p) * np.log(0.5 * (1 - p)) + + p * np.log(0.5 * p) + np.log(0.5)) - np.log(2) + + # Assert the same tolerance. 
+    for n_neighbors in [3, 5, 7]:
+        I_computed = _compute_mi(x, y, True, False, n_neighbors)
+        assert_almost_equal(I_computed, I_theory, 1)
+
+
+def test_compute_mi_cd_unique_label():
+    # Test that adding a unique label doesn't change MI.
+    n_samples = 100
+    x = np.random.uniform(size=n_samples) > 0.5
+
+    y = np.empty(n_samples)
+    mask = x == 0
+    y[mask] = np.random.uniform(-1, 1, size=np.sum(mask))
+    y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask))
+
+    mi_1 = _compute_mi(x, y, True, False)
+
+    x = np.hstack((x, 2))
+    y = np.hstack((y, 10))
+    mi_2 = _compute_mi(x, y, True, False)
+
+    assert_equal(mi_1, mi_2)
+
+
+# We are going to test that feature ordering by MI matches our expectations.
+def test_mutual_info_classif_discrete():
+    X = np.array([[0, 0, 0],
+                  [1, 1, 0],
+                  [2, 0, 1],
+                  [2, 0, 1],
+                  [2, 0, 1]])
+    y = np.array([0, 1, 2, 2, 1])
+
+    # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly
+    # informative.
+    mi = mutual_info_classif(X, y, discrete_features=True)
+    assert_array_equal(np.argsort(-mi), np.array([0, 2, 1]))
+
+
+def test_mutual_info_regression():
+    # We generate a sample from a multivariate normal distribution, using a
+    # transformation of initially uncorrelated variables. The zeroth variable
+    # after the transformation is selected as the target vector; it has the
+    # strongest correlation with variable 2 and the weakest correlation with
+    # variable 1.
+    T = np.array([
+        [1, 0.5, 2, 1],
+        [0, 1, 0.1, 0.0],
+        [0, 0.1, 1, 0.1],
+        [0, 0.1, 0.1, 1]
+    ])
+    cov = T.dot(T.T)
+    mean = np.zeros(4)
+
+    np.random.seed(0)
+    Z = np.random.multivariate_normal(mean, cov, size=1000)
+    X = Z[:, 1:]
+    y = Z[:, 0]
+
+    mi = mutual_info_regression(X, y, random_state=0)
+    assert_array_equal(np.argsort(-mi), np.array([1, 2, 0]))
+
+
+def test_mutual_info_classif_mixed():
+    # Here the target is discrete and there are two continuous and one
+    # discrete feature. The idea of this test is clear from the code.
+    np.random.seed(0)
+    X = np.random.rand(1000, 3)
+    X[:, 1] += X[:, 0]
+    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
+    X[:, 2] = X[:, 2] > 0.5
+
+    mi = mutual_info_classif(X, y, discrete_features=[2], random_state=0)
+    assert_array_equal(np.argsort(-mi), [2, 0, 1])
+
+
+def test_mutual_info_options():
+    X = np.array([[0, 0, 0],
+                  [1, 1, 0],
+                  [2, 0, 1],
+                  [2, 0, 1],
+                  [2, 0, 1]], dtype=float)
+    y = np.array([0, 1, 2, 2, 1], dtype=float)
+    X_csr = csr_matrix(X)
+
+    for mutual_info in (mutual_info_regression, mutual_info_classif):
+        # A sparse X cannot hold continuous features, so this must raise.
+        assert_raises(ValueError, mutual_info, X_csr, y,
+                      discrete_features=False)
+
+        mi_1 = mutual_info(X, y, discrete_features='auto', random_state=0)
+        mi_2 = mutual_info(X, y, discrete_features=False, random_state=0)
+
+        mi_3 = mutual_info(X_csr, y, discrete_features='auto',
+                           random_state=0)
+        mi_4 = mutual_info(X_csr, y, discrete_features=True,
+                           random_state=0)
+
+        assert_array_equal(mi_1, mi_2)
+        assert_array_equal(mi_3, mi_4)
+
+        assert_false(np.allclose(mi_1, mi_3))
+
+
+if __name__ == '__main__':
+    run_module_suite()
diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py
index 9bd8ca273a8dc..d365ec221575f 100644
--- a/sklearn/feature_selection/univariate_selection.py
+++ b/sklearn/feature_selection/univariate_selection.py
@@ -295,7 +295,7 @@ class _BaseFilter(BaseEstimator, SelectorMixin):
     ----------
     score_func : callable
         Function taking two arrays X and y, and returning a pair of arrays
-        (scores, pvalues).
+        (scores, pvalues) or a single array with scores.
""" def __init__(self, score_func): @@ -326,10 +326,16 @@ def fit(self, X, y): % (self.score_func, type(self.score_func))) self._check_params(X, y) + score_func_ret = self.score_func(X, y) + if isinstance(score_func_ret, (list, tuple)): + self.scores_, self.pvalues_ = score_func_ret + self.pvalues_ = np.asarray(self.pvalues_) + else: + self.scores_ = score_func_ret + self.pvalues_ = None - self.scores_, self.pvalues_ = self.score_func(X, y) self.scores_ = np.asarray(self.scores_) - self.pvalues_ = np.asarray(self.pvalues_) + return self def _check_params(self, X, y): @@ -348,7 +354,7 @@ class SelectPercentile(_BaseFilter): ---------- score_func : callable Function taking two arrays X and y, and returning a pair of arrays - (scores, pvalues). + (scores, pvalues) or a single array with scores. percentile : int, optional, default=10 Percent of features to keep. @@ -359,7 +365,7 @@ class SelectPercentile(_BaseFilter): Scores of features. pvalues_ : array-like, shape=(n_features,) - p-values of feature scores. + p-values of feature scores, None if `score_func` returned only scores. Notes ----- @@ -369,8 +375,10 @@ class SelectPercentile(_BaseFilter): See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. + mutual_info_classif: Mutual information for a discrete target. chi2: Chi-squared stats of non-negative features for classification tasks. f_regression: F-value between label/feature for regression tasks. + mutual_info_regression: Mutual information for a continuous target. SelectKBest: Select features based on the k highest scores. SelectFpr: Select features based on a false positive rate test. SelectFdr: Select features based on an estimated false discovery rate. @@ -417,7 +425,7 @@ class SelectKBest(_BaseFilter): ---------- score_func : callable Function taking two arrays X and y, and returning a pair of arrays - (scores, pvalues). + (scores, pvalues) or a single array with scores. k : int or "all", optional, default=10 Number of top features to select. @@ -429,7 +437,7 @@ class SelectKBest(_BaseFilter): Scores of features. pvalues_ : array-like, shape=(n_features,) - p-values of feature scores. + p-values of feature scores, None if `score_func` returned only scores. Notes ----- @@ -439,8 +447,10 @@ class SelectKBest(_BaseFilter): See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. + mutual_info_classif: Mutual information for a discrete target. chi2: Chi-squared stats of non-negative features for classification tasks. f_regression: F-value between label/feature for regression tasks. + mutual_info_regression: Mutual information for a continious target. SelectPercentile: Select features based on percentile of the highest scores. SelectFpr: Select features based on a false positive rate test. SelectFdr: Select features based on an estimated false discovery rate. @@ -504,7 +514,9 @@ class SelectFpr(_BaseFilter): -------- f_classif: ANOVA F-value between label/feature for classification tasks. chi2: Chi-squared stats of non-negative features for classification tasks. + mutual_info_classif: f_regression: F-value between label/feature for regression tasks. + mutual_info_regression: Mutual information between features and the target. SelectPercentile: Select features based on percentile of the highest scores. SelectKBest: Select features based on the k highest scores. SelectFdr: Select features based on an estimated false discovery rate. 
@@ -555,8 +567,10 @@ class SelectFdr(_BaseFilter):
     See also
     --------
     f_classif: ANOVA F-value between label/feature for classification tasks.
+    mutual_info_classif: Mutual information for a discrete target.
     chi2: Chi-squared stats of non-negative features for classification tasks.
     f_regression: F-value between label/feature for regression tasks.
+    mutual_info_regression: Mutual information for a continuous target.
     SelectPercentile: Select features based on percentile of the highest scores.
     SelectKBest: Select features based on the k highest scores.
     SelectFpr: Select features based on a false positive rate test.
@@ -639,7 +653,8 @@ class GenericUnivariateSelect(_BaseFilter):
     ----------
     score_func : callable
         Function taking two arrays X and y, and returning a pair of arrays
-        (scores, pvalues).
+        (scores, pvalues). For modes 'percentile' or 'k_best' it can return
+        a single array of scores.

     mode : {'percentile', 'k_best', 'fpr', 'fdr', 'fwe'}
         Feature selection mode.
@@ -653,13 +668,15 @@ class GenericUnivariateSelect(_BaseFilter):
         Scores of features.

     pvalues_ : array-like, shape=(n_features,)
-        p-values of feature scores.
+        p-values of feature scores, None if `score_func` returned scores only.

     See also
     --------
     f_classif: ANOVA F-value between label/feature for classification tasks.
+    mutual_info_classif: Mutual information for a discrete target.
     chi2: Chi-squared stats of non-negative features for classification tasks.
     f_regression: F-value between label/feature for regression tasks.
+    mutual_info_regression: Mutual information for a continuous target.
     SelectPercentile: Select features based on percentile of the highest scores.
     SelectKBest: Select features based on the k highest scores.
     SelectFpr: Select features based on a false positive rate test.
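
A minimal usage sketch of the behaviour this patch documents (not part of the
diff above): `SelectKBest` and `SelectPercentile` now accept a score function
such as `mutual_info_classif` that returns only scores, in which case
`pvalues_` is set to None. It mirrors the iris snippet in
`doc/modules/feature_selection.rst` and the new tests; the percentile value is
an arbitrary choice for illustration.

    # Sketch assuming the patch above is applied: score-only callables with
    # the univariate selectors (mutual_info_classif returns no p-values).
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import (SelectKBest, SelectPercentile,
                                           mutual_info_classif)

    iris = load_iris()
    X, y = iris.data, iris.target

    # Keep the two features with the highest estimated mutual information.
    selector = SelectKBest(mutual_info_classif, k=2).fit(X, y)
    print(selector.transform(X).shape)  # (150, 2)
    print(selector.scores_)             # MI estimate for each feature
    print(selector.pvalues_)            # None: no p-values for MI scores

    # The same callable works in percentile mode.
    X_pct = SelectPercentile(mutual_info_classif, percentile=50).fit_transform(X, y)
    print(X_pct.shape)                  # (150, 2)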