scikit-learn · ashimb9 · Jul 13, 2017 · Jul 18, 2017 · Jul 18, 2017 · Jul 19, 2017
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
@@ -1193,6 +1193,7 @@ See the :ref:`metrics` section of the user guide for further details.
    preprocessing.FunctionTransformer
    preprocessing.Imputer
    preprocessing.KernelCenterer
+   preprocessing.KNNImputer
    preprocessing.LabelBinarizer
    preprocessing.LabelEncoder
    preprocessing.MultiLabelBinarizer

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
@@ -8,7 +8,7 @@
 #          Lars Buitinck
 #          Joel Nothman <[email protected]>
 # License: BSD 3 clause
-
+from __future__ import division
 import itertools
 from functools import partial
 
@@ -29,6 +29,15 @@
 from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
 
 
+# Locate the entries of an array that are flagged as missing
+def _get_mask(X, value_to_mask):
+    """Return the boolean mask of entries of X equal to value_to_mask."""
+    # NaN never compares equal to itself, so it is detected with np.isnan.
+    nan_requested = value_to_mask == "NaN" or np.isnan(value_to_mask)
+    mask = np.isnan(X) if nan_requested else X == value_to_mask
+    return mask
+
+
 # Utility Functions
 def _return_float_dtype(X, Y):
     """
@@ -54,7 +63,9 @@ def _return_float_dtype(X, Y):
     return X, Y, dtype
 
 
-def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
+def check_pairwise_arrays(X, Y, precomputed=False, dtype=None,
+                          accept_sparse='csr', force_all_finite=True,
+                          copy=False):
     """ Set X and Y appropriately and checks inputs
 
     If Y is None, it is set as a pointer to X (i.e. not a copy).
@@ -84,6 +95,20 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
 
         .. versionadded:: 0.18
 
+    accept_sparse : string, boolean or list/tuple of strings
+        String[s] representing allowed sparse matrix formats, such as 'csc',
+        'csr', etc. If the input is sparse but not in the allowed format,
+        it will be converted to the first listed format. True allows the input
+        to be any format. False means that a sparse matrix input will
+        raise an error.
+
+    force_all_finite : bool
+        Whether to raise an error on np.inf and np.nan in X (or Y if it exists)
+
+    copy : bool
+        Whether a forced copy will be triggered. If copy=False, a copy might
+        be triggered by a conversion.
+
     Returns
     -------
     safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
@@ -102,12 +127,15 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
         dtype = dtype_float
 
     if Y is X or Y is None:
-        X = Y = check_array(X, accept_sparse='csr', dtype=dtype,
+        X = Y = check_array(X, accept_sparse=accept_sparse, dtype=dtype,
+                            copy=copy, force_all_finite=force_all_finite,
                             warn_on_dtype=warn_on_dtype, estimator=estimator)
     else:
-        X = check_array(X, accept_sparse='csr', dtype=dtype,
+        X = check_array(X, accept_sparse=accept_sparse, dtype=dtype,
+                        copy=copy, force_all_finite=force_all_finite,
                         warn_on_dtype=warn_on_dtype, estimator=estimator)
-        Y = check_array(Y, accept_sparse='csr', dtype=dtype,
+        Y = check_array(Y, accept_sparse=accept_sparse, dtype=dtype,
+                        copy=copy, force_all_finite=force_all_finite,
                         warn_on_dtype=warn_on_dtype, estimator=estimator)
 
     if precomputed:
@@ -217,7 +245,7 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
 
     See also
     --------
-    paired_distances : distances betweens pairs of elements of X and Y.
+    paired_distances : distances between pairs of elements of X and Y.
     """
     X, Y = check_pairwise_arrays(X, Y)
 
@@ -256,6 +284,124 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
     return distances if squared else np.sqrt(distances, out=distances)
 
 
+def masked_euclidean_distances(X, Y=None, squared=False,
+                               missing_values="NaN", copy=True):
+    """Calculates euclidean distances in the presence of missing values
+
+    Considering the rows of X (and Y=X) as samples, compute the distance matrix
+    between each pair of samples. Similarly, if Y is not X, then compute the
+    distance matrix between each sample pair (i.e., each row pair) in X and Y.
+
+    When calculating the distance between a pair of samples, this formulation
+    essentially zero-weights feature coordinates with a missing value in either
+    sample and scales up the weight of the remaining coordinates:
+
+        dist(x,y) = sqrt(weight * sq. distance from non-missing coordinates)
+        where,
+        weight = Total # of coordinates / # of non-missing coordinates
+
+    Note that NaN is returned for a pair of samples that has no common
+    non-missing coordinates.
+
+    Read more in the :ref:`User Guide <metrics>`.
+
+    Parameters
+    ----------
+    X : array-like, shape (n_samples_1, n_features)
+
+    Y : array-like, shape (n_samples_2, n_features), optional
+
+    squared : boolean, optional
+        Return squared Euclidean distances.
+
+    missing_values : "NaN", np.nan or integer, optional
+        Representation of missing value
+
+    copy : boolean, optional
+        Make and use a deep copy of X and Y (if Y exists)
+
+    Returns
+    -------
+    distances : array, shape (n_samples_1, n_samples_2)
+
+    Raises
+    ------
+    ValueError
+        If X or Y contains +/- infinity, has a row with only missing values,
+        or contains NaN while missing_values is neither "NaN" nor np.nan.
+
+    Examples
+    --------
+    >>> from sklearn.metrics.pairwise import masked_euclidean_distances
+    >>> nan = float("NaN")
+    >>> X = [[0, 1], [1, nan]]
+    >>> # distance between rows of X
+    >>> masked_euclidean_distances(X, X)
+    array([[ 0.        ,  1.41421356],
+           [ 1.41421356,  0.        ]])
+
+    >>> # get distance to origin
+    >>> masked_euclidean_distances(X, [[0, 0]])
+    array([[ 1.        ],
+           [ 1.41421356]])
+
+    References
+    ----------
+    * John K. Dixon, "Pattern Recognition with Partly Missing Data",
+      IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:
+      10, pp. 617 - 621, Oct. 1979.
+      http://ieeexplore.ieee.org/abstract/document/4310090/
+
+    See also
+    --------
+    paired_distances : distances between pairs of elements of X and Y.
+    """
+    # NOTE: force_all_finite=False allows not only NaN but also +/- inf
+    X, Y = check_pairwise_arrays(X, Y, accept_sparse=False,
+                                 force_all_finite=False, copy=copy)
+    if np.any(np.isinf(X)) or (Y is not X and np.any(np.isinf(Y))):
+        raise ValueError("+/- Infinite values are not allowed.")
+
+    # Get missing mask for X and Y.T
+    mask_X = _get_mask(X, missing_values)
+    YT = Y.T
+    mask_YT = _get_mask(YT, missing_values)
+
+    # Check if any rows have only missing value
+    if np.any(mask_X.sum(axis=1) == X.shape[1])\
+            or (Y is not X and np.any(mask_YT.sum(axis=0) == Y.shape[1])):
+        raise ValueError("One or more rows only contain missing values.")
+
+    # NaN may be requested as the string "NaN" or as a float NaN (np.nan).
+    nan_is_missing = missing_values == "NaN" or np.isnan(missing_values)
+    if not nan_is_missing and (
+            np.any(np.isnan(X)) or (Y is not X and np.any(np.isnan(Y)))):
+        raise ValueError("NaN values present but missing_value = {0}"
+                         .format(missing_values))
+
+    # Get anti-mask and set Y.T's missing to zero
+    NYT = (~mask_YT).astype(np.int32)
+    YT[mask_YT] = 0
+
+    # Get X anti-mask and set X's missing to zero
+    NX = (~mask_X).astype(np.int32)
+    X[mask_X] = 0
+
+    # Calculate distances
+    # The following formula was derived in matrix form by:
+    # Shreya Bhattarai <[email protected]>
+    distances = (X.shape[1] / (np.dot(NX, NYT))) * \
+                (np.dot(X * X, NYT) - 2 * (np.dot(X, YT)) +
+                 np.dot(NX, YT * YT))
+
+    if X is Y:
+        # Ensure that distances between vectors and themselves are set to 0.0.
+        # This may not be the case due to floating point rounding errors.
+        distances.flat[::distances.shape[0] + 1] = 0.0
+
+    return distances if squared else np.sqrt(distances, out=distances)
+
+
 def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean",
                                   batch_size=500, metric_kwargs=None):
     """Compute minimum distances between one point and a set of points.
@@ -1040,6 +1186,7 @@ def chi2_kernel(X, Y=None, gamma=1.):
     'l1': manhattan_distances,
     'manhattan': manhattan_distances,
     'precomputed': None,  # HACK: precomputed is always allowed, never called
+    'masked_euclidean': masked_euclidean_distances,
 }
 
 
@@ -1052,16 +1199,17 @@ def distance_metrics():
 
     The valid distance metrics, and the function they map to, are:
 
-    ============     ====================================
-    metric           Function
-    ============     ====================================
-    'cityblock'      metrics.pairwise.manhattan_distances
-    'cosine'         metrics.pairwise.cosine_distances
-    'euclidean'      metrics.pairwise.euclidean_distances
-    'l1'             metrics.pairwise.manhattan_distances
-    'l2'             metrics.pairwise.euclidean_distances
-    'manhattan'      metrics.pairwise.manhattan_distances
-    ============     ====================================
+    ===================     ============================================
+    metric                  Function
+    ===================     ============================================
+    'cityblock'             metrics.pairwise.manhattan_distances
+    'cosine'                metrics.pairwise.cosine_distances
+    'euclidean'             metrics.pairwise.euclidean_distances
+    'l1'                    metrics.pairwise.manhattan_distances
+    'l2'                    metrics.pairwise.euclidean_distances
+    'manhattan'             metrics.pairwise.manhattan_distances
+    'masked_euclidean'      metrics.pairwise.masked_euclidean_distances
+    ===================     ============================================
 
     Read more in the :ref:`User Guide <metrics>`.
 
@@ -1128,7 +1276,10 @@ def _pairwise_callable(X, Y, metric, **kwds):
                   'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
                   'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
                   'russellrao', 'seuclidean', 'sokalmichener',
-                  'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"]
+                  'sokalsneath', 'sqeuclidean', 'yule', "wminkowski",
+                  'masked_euclidean']
+
+_MASKED_SUPPORTED_METRICS = ['masked_euclidean']  # accept missing values
 
 
 def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
@@ -1149,6 +1300,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
 
     - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
       'manhattan']. These metrics support sparse matrix inputs.
+      Also, ['masked_euclidean'] but it does not yet support sparse matrices.
 
     - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
       'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
@@ -1216,6 +1368,14 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
                          "Valid metrics are %s, or 'precomputed', or a "
                          "callable" % (metric, _VALID_METRICS))
 
+    if metric in _MASKED_SUPPORTED_METRICS:
+        missing_values = kwds.get("missing_values") if kwds.get(
+            "missing_values") is not None else np.nan
+
+        if(np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])):
+            raise ValueError(
+                "One or more samples(s) only have missing values.")
+
     if metric == "precomputed":
         X, _ = check_pairwise_arrays(X, Y, precomputed=True)
         return X

diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
@@ -17,6 +17,7 @@
 from sklearn.externals.six import iteritems
 
 from sklearn.metrics.pairwise import euclidean_distances
+from sklearn.metrics.pairwise import masked_euclidean_distances
 from sklearn.metrics.pairwise import manhattan_distances
 from sklearn.metrics.pairwise import linear_kernel
 from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel
@@ -56,6 +57,15 @@ def test_pairwise_distances():
     S = pairwise_distances(X, Y, metric="euclidean")
     S2 = euclidean_distances(X, Y)
     assert_array_almost_equal(S, S2)
+    # Check to ensure NaNs work with pairwise_distances.
+    X_masked = rng.random_sample((5, 4))
+    Y_masked = rng.random_sample((2, 4))
+    X_masked[0, 0] = np.nan
+    Y_masked[0, 0] = np.nan
+    S_masked = pairwise_distances(X_masked, Y_masked,
+                                  metric="masked_euclidean")
+    S2_masked = masked_euclidean_distances(X_masked, Y_masked)
+    assert_array_almost_equal(S_masked, S2_masked)
     # Test with tuples as X and Y
     X_tuples = tuple([tuple([v for v in row]) for row in X])
     Y_tuples = tuple([tuple([v for v in row]) for row in Y])
@@ -407,6 +417,72 @@ def test_euclidean_distances():
     assert_greater(np.max(np.abs(wrong_D - D1)), .01)
 
 
+def test_masked_euclidean_distances():
+    # Exercise masked_euclidean_distances on data with missing values.
+    # Two small matrices with NaNs scattered across rows and columns
+    nan = np.nan
+    X = np.array([[1., nan, 3., 4., 2.],
+                  [nan, 4., 6., 1., nan],
+                  [3., nan, nan, nan, 1.]])
+
+    Y = np.array([[nan, 7., 7., nan, 2.],
+                  [nan, nan, 5., 4., 7.],
+                  [nan, nan, nan, 4., 5.]])
+
+    # Squaring the plain distances must reproduce the squared=True output
+    dist = masked_euclidean_distances(X, Y, missing_values="NaN")
+    sq_dist = masked_euclidean_distances(X, Y, squared=True,
+                                         missing_values="NaN")
+    assert_array_almost_equal(dist**2, sq_dist)
+
+    # The squared distances must match the precomputed reference values
+    expected_sq = np.array([[40., 48.33333331, 22.5],
+                            [25., 25., 45.],
+                            [5., 180., 80.]])
+    recomputed_sq = masked_euclidean_distances(X, Y, squared=True,
+                                               missing_values="NaN")
+    assert_array_almost_equal(expected_sq, recomputed_sq)
+
+    # One pair by hand: 2 of 5 coordinates are shared, so the weight is 5/2
+    by_hand = [[5.0/2.0 * ((7-3)**2 + (2-2)**2)]]
+    first_pair = masked_euclidean_distances(X[:1], Y[:1], squared=True)
+    assert_array_almost_equal(first_pair, by_hand)
+
+    # Omitting Y must give the same result as passing Y = X explicitly
+    implicit_self = masked_euclidean_distances(X, missing_values="NaN")
+    explicit_self = masked_euclidean_distances(X, X, missing_values="NaN")
+    assert_array_almost_equal(implicit_self, explicit_self)
+
+    # NaNs in the data are rejected when missing_values is not NaN
+    assert_raises(ValueError, masked_euclidean_distances, X, Y,
+                  missing_values=1)
+
+    # Infinite entries are rejected
+    X_inf = np.array([[np.inf, nan, 3., 4., 2.],
+                      [nan, 4., 6., 1., nan],
+                      [3., nan, nan, nan, 1.]])
+
+    assert_raises(ValueError, masked_euclidean_distances, X_inf, Y)
+
+    # A row made only of NaNs is rejected, whether it sits in X or in Y
+    X_nan_row = np.array([[1., nan, 3., 4., 2.],
+                          [nan, 4., 6., 1., nan],
+                          [nan, nan, nan, nan, nan]])
+
+    Y_nan_row = np.array([[nan, 7., 7., nan, 2.],
+                          [nan, nan, 5., 4., 7.],
+                          [nan, nan, nan, nan, nan]])
+
+    assert_raises(ValueError, masked_euclidean_distances, X_nan_row, Y)
+    assert_raises(ValueError, masked_euclidean_distances, X, Y_nan_row)
+
+    # copy=True and copy=False must agree.
+    # Note: this check is kept last because copy=False alters X and Y
+    with_copy = masked_euclidean_distances(X, Y, copy=True)
+    in_place = masked_euclidean_distances(X, Y, copy=False)
+    assert_array_almost_equal(with_copy, in_place)
+
+
 def test_cosine_distances():
     # Check the pairwise Cosine distances computation
     rng = np.random.RandomState(1337)