ENH: RadiusNeighborsRegressor speedup #24053


Open: wants to merge 3 commits into main
44 changes: 41 additions & 3 deletions asv_benchmarks/benchmarks/neighbors.py
@@ -1,8 +1,8 @@
-from sklearn.neighbors import KNeighborsClassifier
+from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsRegressor
 
 from .common import Benchmark, Estimator, Predictor
-from .datasets import _20newsgroups_lowdim_dataset
-from .utils import make_gen_classif_scorers
+from .datasets import _20newsgroups_lowdim_dataset, _synth_regression_dataset
+from .utils import make_gen_classif_scorers, make_gen_reg_scorers
 
 
 class KNeighborsClassifierBenchmark(Predictor, Estimator, Benchmark):
@@ -37,3 +37,41 @@ def make_estimator(self, params):
 
     def make_scorers(self):
         make_gen_classif_scorers(self)
+
+
+class RadiusNeighborsRegressorBenchmark(Predictor, Estimator, Benchmark):
+    """
+    Benchmarks for RadiusNeighborsRegressor
+    """
+
+    param_names = ["algorithm", "dimension", "n_jobs"]
+    params = (
+        ["brute", "kd_tree", "ball_tree"],
+        ["very-low", "low", "high"],
+        Benchmark.n_jobs_vals,
+    )
+    dim_to_number = {"very-low": 3, "low": 20, "high": 200}
+
+    def setup_cache(self):
+        super().setup_cache()
+
+    def make_data(self, params):
+        algorithm, dimension, n_jobs = params
+
+        n_features = self.dim_to_number[dimension]
+
+        data = _synth_regression_dataset(n_samples=10000, n_features=n_features)
Review comment (Member):
A similar improvement has been tried in #23721, but it suffered from memory overconsumption as n_samples got larger (see #23721 (review)).

Is it possible to report ASV results for n_samples=int(1e6) or even n_samples=int(1e7)?
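A minimal sketch of such a stress test, for illustration only: make_regression stands in for the benchmark helper _synth_regression_dataset, and all sizes here are assumptions, not results from this PR.

    # Illustrative memory stress test at the suggested n_samples; not part of this PR.
    from sklearn.datasets import make_regression
    from sklearn.neighbors import RadiusNeighborsRegressor

    X, y = make_regression(n_samples=int(1e6), n_features=3, random_state=0)
    reg = RadiusNeighborsRegressor(algorithm="ball_tree", radius=0.05).fit(X, y)
    # Peak memory during predict() is the quantity of interest: the intermediate
    # neighbor structures (and the CSR matrix) grow with the total neighbor count.
    y_pred = reg.predict(X[:10_000])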


+
+        return data
+
+    def make_estimator(self, params):
+        algorithm, dimension, n_jobs = params
+
+        estimator = RadiusNeighborsRegressor(
+            algorithm=algorithm, n_jobs=n_jobs, radius=0.05
Review comment (Member):
Due to the curse of dimensionality, the radius is notably small for dim_to_number >= 20. Is it possible to scale it by log(dim_to_number) or to define it as the 10th percentile of the observed distances?
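For concreteness, a sketch of the percentile idea; the 1-NN distance proxy and all names below are assumptions, not part of the PR.

    # Pick the radius as the 10th percentile of observed 1-NN distances so it
    # adapts to the dimension, rather than hard-coding radius=0.05.
    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.neighbors import NearestNeighbors

    X, _ = make_regression(n_samples=10_000, n_features=20, random_state=0)
    dist, _ = NearestNeighbors(n_neighbors=2).fit(X).kneighbors(X)
    radius = np.quantile(dist[:, 1], 0.10)  # column 0 is each point's self-distance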

+        )
+
+        return estimator
+
+    def make_scorers(self):
+        make_gen_reg_scorers(self)
37 changes: 21 additions & 16 deletions sklearn/neighbors/_regression.py
@@ -13,6 +13,7 @@
 import warnings
 
 import numpy as np
+from scipy.sparse import csr_matrix
 
 from ._base import _get_weights
 from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin
@@ -466,25 +467,29 @@ def predict(self, X):
         if _y.ndim == 1:
             _y = _y.reshape((-1, 1))
 
-        empty_obs = np.full_like(_y[0], np.nan)
+        # converting weights to sparse matrix
+        n_samples = len(neigh_ind)
+        n_neigh = np.asarray([len(ind) for ind in neigh_ind])
 
-        if weights is None:
-            y_pred = np.array(
-                [
-                    np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs
-                    for (i, ind) in enumerate(neigh_ind)
-                ]
-            )
+        neigh_dst = np.concatenate(neigh_ind, axis=0)
+        neigh_src = np.repeat(np.arange(n_samples), repeats=n_neigh)
Review comment (Member):
This matrix might have a significant memory footprint.
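A back-of-the-envelope estimate of that footprint, under assumed sizes (float64 data and int32 indices, SciPy's index dtype for matrices this small in nnz):

    # Rough CSR size: nnz values (float64) + nnz column indices (int32)
    # + (n_samples + 1) row pointers (int32); nnz = total number of neighbors.
    n_samples, avg_neighbors = 1_000_000, 50   # assumed workload, not measured
    nnz = n_samples * avg_neighbors
    csr_bytes = nnz * 8 + nnz * 4 + (n_samples + 1) * 4
    print(f"~{csr_bytes / 1e9:.1f} GB")        # ~0.6 GB under these assumptions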


+        if weights is None:
+            weights = np.ones(len(neigh_src), dtype=bool)
         else:
-            y_pred = np.array(
-                [
-                    np.average(_y[ind, :], axis=0, weights=weights[i])
-                    if len(ind)
-                    else empty_obs
-                    for (i, ind) in enumerate(neigh_ind)
-                ]
-            )
+            weights = np.concatenate(weights, axis=0)
+
+        weights = csr_matrix(
+            (weights, (neigh_src, neigh_dst)),
+            shape=(n_samples, len(_y)),
+        )
+
+        # normalization factor so weights sum = 1
+        norm_factor = weights.sum(axis=1)
+        y_pred = weights @ _y
+        with np.errstate(divide="ignore"):
+            # normalizing and assigning NaN to lonely samples
+            y_pred = np.where(norm_factor > 0, y_pred / norm_factor, np.nan)

         if np.any(np.isnan(y_pred)):
             empty_warning_msg = (
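For readers skimming the diff: the change replaces the per-query Python loop with a single sparse matrix product. A self-contained sketch of the same technique on toy data; the function name is hypothetical, though the variable names mirror the diff.

    import numpy as np
    from scipy.sparse import csr_matrix

    def radius_average(neigh_ind, y):
        """Average y over each query's neighborhood with one CSR matmul."""
        n_queries = len(neigh_ind)
        n_neigh = np.asarray([len(ind) for ind in neigh_ind])
        cols = np.concatenate(neigh_ind)                 # neighbor (training) indices
        rows = np.repeat(np.arange(n_queries), n_neigh)  # owning query of each entry
        w = csr_matrix(
            (np.ones(len(rows)), (rows, cols)), shape=(n_queries, len(y))
        )
        norm = np.asarray(w.sum(axis=1))                 # neighbor count per query
        pred = w @ y.reshape(len(y), -1)                 # sums over neighborhoods
        with np.errstate(divide="ignore", invalid="ignore"):
            # empty neighborhoods (norm == 0) become NaN, as in the PR
            return np.where(norm > 0, pred / norm, np.nan)

    y = np.array([1.0, 2.0, 3.0, 4.0])
    neigh_ind = [np.array([0, 1]), np.array([], dtype=int), np.array([3])]
    print(radius_average(neigh_ind, y))  # [[1.5], [nan], [4.0]]

One CSR build plus one matmul delegates all per-neighborhood work to compiled SciPy code, which is where the speedup comes from; the trade-off, as the reviewer notes above, is that the matrix holds one entry per (query, neighbor) pair.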