scikit-learn · rth · Aug 1, 2019 · Aug 1, 2019 · Aug 1, 2019 · Aug 31, 2020
diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py
@@ -9,8 +9,7 @@
 # License: BSD 3 clause (C) INRIA, University of Amsterdam
 
 import numpy as np
-from scipy import stats
-from ..utils.extmath import weighted_mode
+from ..utils.extmath import _fast_mode
 from ..utils.validation import _is_arraylike, _num_samples
 
 import warnings
@@ -207,11 +206,7 @@ def predict(self, X):
 
         y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
         for k, classes_k in enumerate(classes_):
-            if weights is None:
-                mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
-            else:
-                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
-
+            mode = _fast_mode(_y[neigh_ind, k], weights=weights, axis=1)
             mode = np.asarray(mode.ravel(), dtype=np.intp)
             y_pred[:, k] = classes_k.take(mode)
 

diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
@@ -374,6 +374,74 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
         return U[:, :n_components], s[:n_components], Vt[:n_components, :]
 
 
+def _fast_mode(x, weights=None, *, axis=1):
+    """Return the (optionally weighted) row-wise mode of a 2D integer array.
+
+    Faster equivalent of ``scipy.stats.mode``, for non-negative integers only.
+
+    Parameters
+    ----------
+    x : ndarray of shape (n_samples, n_components)
+        2D array of non-negative integers of which to find the mode(s).
+    weights : array-like of shape (n_samples, n_components), default=None
+        Weight of each value in ``x``. If None, every value has weight 1.
+    axis : int, default=1
+        Axis along which to operate. Only ``axis=1`` is supported.
+
+    Returns
+    -------
+    mode : ndarray of shape (n_samples, 1)
+        Value with the largest summed weight in each row of ``x``.
+
+    Raises
+    ------
+    ValueError
+        If ``axis != 1``, if ``x`` is not a 2D integer array or contains
+        negative values, or if ``weights`` and ``x`` differ in shape.
+
+    Examples
+    --------
+    >>> x = np.array([[0, 1, 1], [2, 0, 2]])
+    >>> _fast_mode(x, axis=1).ravel()
+    array([1, 2])
+
+    With uniform weights the result is the plain mode (4 appears 3 times):
+
+    >>> x = np.array([[4, 1, 4, 2, 4, 2]])
+    >>> _fast_mode(x, np.ones(x.shape)).ravel()
+    array([4])
+
+    Below, 2 has the highest score (1.5 + 2 = 3.5):
+
+    >>> weights = np.array([[1, 3, 0.5, 1.5, 1, 2]])  # deweight the 4's
+    >>> _fast_mode(x, weights).ravel()
+    array([2])
+    """
+    if axis != 1:
+        raise ValueError('Only axis=1 is supported.')
+    if not hasattr(x, "__array__") or x.dtype.kind != 'i' or x.ndim != 2:
+        raise ValueError('_fast_mode is only implemented for 2D integer '
+                         'arrays!')
+    if x.min() < 0:
+        raise ValueError('only positive data is supported.')
+
+    if weights is None:
+        # the ``np.int`` alias was removed in NumPy 1.24; use builtin ``int``
+        data = np.ones(x.size, dtype=int)
+    else:
+        weights = np.asarray(weights)  # accept lists, as documented
+        if x.shape != weights.shape:
+            raise ValueError("x.shape {} !=  weights.shape {}"
+                             .format(x.shape, weights.shape))
+        data = np.ascontiguousarray(weights).ravel()
+    indices = np.ascontiguousarray(x).ravel()
+    indptr = np.arange(x.shape[0]+1)*x.shape[1]
+    # duplicate entries are summed by scipy, so each row's argmax is its mode
+    z = sparse.csr_matrix((data, indices, indptr),
+                          shape=(x.shape[0], x.max() + 1))
+    return np.asarray(np.argmax(z, axis=1))
+
+
 @_deprecate_positional_args
 def weighted_mode(a, w, *, axis=0):
     """Returns an array of the weighted modal (most common) value in a.

diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
@@ -32,7 +32,7 @@
 from sklearn.utils.extmath import _deterministic_vector_sign_flip
 from sklearn.utils.extmath import softmax
 from sklearn.utils.extmath import stable_cumsum
-from sklearn.utils.extmath import safe_sparse_dot
+from sklearn.utils.extmath import safe_sparse_dot, _fast_mode
 from sklearn.datasets import make_low_rank_matrix
 
 
@@ -637,6 +637,69 @@ def test_stable_cumsum():
     assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))
 
 
+class TestFastMode():
+    """Tests for ``_fast_mode``, using ``scipy.stats.mode`` as reference."""
+    def test_scipy_stats_axis_1(self):
+        rng = np.random.RandomState(0)
+        # ravel: stats.mode returns (n, 1) or (n,) depending on SciPy version
+        X = rng.randint(10, size=(100, 20))
+        mode_ref, _ = stats.mode(X, axis=1)
+        mode = _fast_mode(X, axis=1)
+        assert_array_equal(mode.ravel(), np.ravel(mode_ref))
+
+    @pytest.mark.parametrize(
+            'x',
+            [np.ones((10, 10), dtype=float), 1, np.ones(5, dtype=int)],
+            ids=['array_float64', 'int', '1D-array'])
+    def test_input_validation(self, x):
+        with pytest.raises(ValueError,
+                           match='only implemented for 2D integer arrays'):
+            _fast_mode(x)
+
+    def test_negative_values(self):
+        x = - np.ones((10, 10), dtype=int)
+        with pytest.raises(ValueError,
+                           match="only positive data is supported"):
+            _fast_mode(x)
+
+    def test_ties(self):
+        # Check that ties are resolved in the same way as in stats.mode
+        X = np.ones((6, 9), dtype=int)
+        X[:, 3:] = 2
+        X[:, 6:] = 3
+        mode_ref, _ = stats.mode(X, axis=1)
+        mode = _fast_mode(X, axis=1)
+        assert_array_equal(mode.ravel(), np.ravel(mode_ref))
+
+    def test_uniform_weights(self):
+        # with uniform weights, results should be identical to
+        # stats.mode
+        rng = np.random.RandomState(0)
+        x = rng.randint(10, size=(10, 5))
+        weights = np.ones(x.shape)
+
+        mode, _ = stats.mode(x, axis=1)
+        mode2 = _fast_mode(x, weights, axis=1)
+        # ravel both sides so the check does not depend on stats.mode's shape
+        assert_array_equal(np.ravel(mode), mode2.ravel())
+
+    def test_random_weights(self):
+        # set this up so that each row should have a weighted mode of 6,
+        # with a score that is easily reproduced
+        mode_result = 6
+
+        rng = np.random.RandomState(0)
+        x = rng.randint(mode_result, size=(100, 10))
+        w = rng.random_sample(x.shape)
+        # force value 6 into the first five columns and give it extra weight
+        x[:, :5] = mode_result
+        w[:, :5] += 1
+
+        mode = _fast_mode(x, w, axis=1)
+
+        assert_array_equal(mode, mode_result)
+
+
 @pytest.mark.parametrize("A_array_constr", [np.array, sparse.csr_matrix],
                          ids=["dense", "sparse"])
 @pytest.mark.parametrize("B_array_constr", [np.array, sparse.csr_matrix],