Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

WIP PERF Faster KNeighborsClassifier.predict #14543

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions sklearn/neighbors/_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
# License: BSD 3 clause (C) INRIA, University of Amsterdam

import numpy as np
from scipy import stats
from ..utils.extmath import weighted_mode
from ..utils.extmath import _fast_mode
from ..utils.validation import _is_arraylike, _num_samples

import warnings
Expand Down Expand Up @@ -207,11 +206,7 @@ def predict(self, X):

y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
for k, classes_k in enumerate(classes_):
if weights is None:
mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
else:
mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)

mode = _fast_mode(_y[neigh_ind, k], weights=weights, axis=1)
mode = np.asarray(mode.ravel(), dtype=np.intp)
y_pred[:, k] = classes_k.take(mode)

Expand Down
68 changes: 68 additions & 0 deletions sklearn/utils/extmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,74 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
return U[:, :n_components], s[:n_components], Vt[:n_components, :]


def _fast_mode(x, weights=None, *, axis=1):
    """Return the (optionally weighted) row-wise mode of a 2D integer array.

    This is a faster alternative to ``scipy.stats.mode`` (and, when
    ``weights`` is given, to ``weighted_mode``). It is only implemented for
    non-negative integer data and for ``axis=1``.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_components)
        2D array of non-negative integers of which to find the mode(s).
    weights : array-like of shape (n_samples, n_components), default=None
        Weight of each value in ``x``. If None, every value has weight 1.
    axis : int, default=1
        Axis along which to operate. Only ``axis=1`` is supported.

    Returns
    -------
    mode : ndarray of shape (n_samples, 1)
        For each row of ``x``, the value with the largest summed weight.

    Raises
    ------
    ValueError
        If ``axis`` is not 1, if ``x`` is not a 2D signed-integer array, if
        ``weights`` does not have the same shape as ``x``, if ``x`` has rows
        but no columns, or if ``x`` contains negative values.

    Examples
    --------
    >>> x = np.array([[0, 1, 1], [2, 0, 2]])
    >>> _fast_mode(x, axis=1)
    array([[1],
           [2]])

    Next we illustrate weighted mode calculations

    >>> x = np.array([[4, 1, 4, 2, 4, 2]])
    >>> weights = np.array([[1, 1, 1, 1, 1, 1]])
    >>> _fast_mode(x, weights)
    array([[4]])

    The value 4 appears three times: with uniform weights, the result is
    simply the mode of the distribution.

    >>> weights = np.array([[1, 3, 0.5, 1.5, 1, 2]])  # deweight the 4's
    >>> _fast_mode(x, weights)
    array([[2]])

    The value 2 has the highest score: it appears twice with weights of
    1.5 and 2: the sum of these is 3.5.

    """
    # Fail fast on the unsupported axis before doing any work.
    if axis != 1:
        raise ValueError('Only axis=1 is supported.')
    if not hasattr(x, "__array__") or x.dtype.kind != 'i' or x.ndim != 2:
        raise ValueError('_fast_mode is only implemented for 2D integer '
                         'arrays!')

    if weights is not None:
        # Accept any array-like (e.g. nested lists), not only ndarrays.
        weights = np.asarray(weights)
        if x.shape != weights.shape:
            raise ValueError("x.shape {} != weights.shape {}"
                             .format(x.shape, weights.shape))

    n_rows, n_cols = x.shape
    if n_rows == 0:
        # No rows means no modes; avoid reducing over an empty array below.
        return np.empty((0, 1), dtype=np.intp)
    if n_cols == 0:
        raise ValueError('cannot compute the mode of rows with no elements.')
    # NOTE: zero is accepted; the message wording is kept for compatibility.
    if x.min() < 0:
        raise ValueError('only positive data is supported.')

    if weights is None:
        # ``np.int`` was removed from NumPy; use a concrete integer dtype.
        data = np.ones(x.size, dtype=np.intp)
    else:
        data = np.ascontiguousarray(weights).ravel()
    indices = np.ascontiguousarray(x).ravel()
    indptr = np.arange(n_rows + 1) * n_cols

    # We use the fact that data for repeated indices is summed when
    # reducing a sparse matrix: row i then holds, in column v, the total
    # weight of value v. The column with the highest total is the mode.
    # ``int(...)`` prevents overflow of ``max + 1`` for narrow integer dtypes.
    totals = sparse.csr_matrix((data, indices, indptr),
                               shape=(n_rows, int(x.max()) + 1))
    return np.asarray(totals.argmax(axis=1)).reshape(-1, 1)


@_deprecate_positional_args
def weighted_mode(a, w, *, axis=0):
"""Returns an array of the weighted modal (most common) value in a.
Expand Down
65 changes: 64 additions & 1 deletion sklearn/utils/tests/test_extmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils.extmath import softmax
from sklearn.utils.extmath import stable_cumsum
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils.extmath import safe_sparse_dot, _fast_mode
from sklearn.datasets import make_low_rank_matrix


Expand Down Expand Up @@ -637,6 +637,69 @@ def test_stable_cumsum():
assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))


class TestFastMode:
    """Tests for ``_fast_mode``, using ``scipy.stats.mode`` as the reference.

    ``_fast_mode`` returns a column vector, while the shape returned by
    ``stats.mode`` depends on the SciPy version (``keepdims``), so results
    are flattened before being compared.
    """

    def test_scipy_stats_axis_1(self):
        # Unweighted mode on random data must agree with stats.mode.
        rng = np.random.RandomState(0)

        X = rng.randint(10, size=(100, 20))
        mode_ref, _ = stats.mode(X, axis=1)
        mode = _fast_mode(X, axis=1)
        assert_array_equal(np.ravel(mode), np.ravel(mode_ref))

    @pytest.mark.parametrize(
        'x',
        # The ``np.float`` / ``np.int`` aliases were removed from NumPy;
        # use explicit dtypes instead.
        [np.ones((10, 10), dtype=np.float64), 1, np.ones(5, dtype=int)],
        ids=['array_float64', 'int', '1D-array'])
    def test_input_validation(self, x):
        # Anything that is not a 2D integer array is rejected.
        with pytest.raises(ValueError,
                           match='only implemented for 2D integer arrays'):
            _fast_mode(x)

    def test_negative_values(self):
        # Negative entries cannot be used as column indices.
        x = - np.ones((10, 10), dtype=int)
        with pytest.raises(ValueError,
                           match="only positive data is supported"):
            _fast_mode(x)

    def test_ties(self):
        # Check that ties are resolved in the same way as in stats.mode
        X = np.ones((6, 9), dtype=int)
        X[:, 3:] = 2
        X[:, 6:] = 3
        mode_ref, _ = stats.mode(X, axis=1)
        mode = _fast_mode(X, axis=1)
        assert_array_equal(np.ravel(mode), np.ravel(mode_ref))

    def test_uniform_weights(self):
        # with uniform weights, results should be identical to
        # stats.mode
        rng = np.random.RandomState(0)
        x = rng.randint(10, size=(10, 5))
        weights = np.ones(x.shape)

        mode, _ = stats.mode(x, axis=1)
        mode2 = _fast_mode(x, weights, axis=1)

        assert_array_equal(np.ravel(mode), np.ravel(mode2))

    def test_random_weights(self):
        # set this up so that each row should have a weighted mode of 6,
        # with a score that is easily reproduced
        mode_result = 6

        rng = np.random.RandomState(0)
        # randint's upper bound is exclusive, so 6 never appears by chance.
        x = rng.randint(mode_result, size=(100, 10))
        w = rng.random_sample(x.shape)

        # Force half of every row to the target value with boosted weights.
        x[:, :5] = mode_result
        w[:, :5] += 1

        mode = _fast_mode(x, w, axis=1)

        assert_array_equal(mode, mode_result)


@pytest.mark.parametrize("A_array_constr", [np.array, sparse.csr_matrix],
ids=["dense", "sparse"])
@pytest.mark.parametrize("B_array_constr", [np.array, sparse.csr_matrix],
Expand Down