-
-
Notifications
You must be signed in to change notification settings - Fork 26k
ENH: RadiusNeighborsRegressor speedup #24053
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
from sklearn.neighbors import KNeighborsClassifier | ||
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsRegressor | ||
|
||
from .common import Benchmark, Estimator, Predictor | ||
from .datasets import _20newsgroups_lowdim_dataset | ||
from .utils import make_gen_classif_scorers | ||
from .datasets import _20newsgroups_lowdim_dataset, _synth_regression_dataset | ||
from .utils import make_gen_classif_scorers, make_gen_reg_scorers | ||
|
||
|
||
class KNeighborsClassifierBenchmark(Predictor, Estimator, Benchmark): | ||
|
@@ -37,3 +37,41 @@ def make_estimator(self, params): | |
|
||
def make_scorers(self): | ||
make_gen_classif_scorers(self) | ||
|
||
|
||
class RadiusNeighborsRegressorBenchmark(Predictor, Estimator, Benchmark): | ||
""" | ||
Benchmarks for RadiusNeighborsRegressor | ||
""" | ||
|
||
param_names = ["algorithm", "dimension", "n_jobs"] | ||
params = ( | ||
["brute", "kd_tree", "ball_tree"], | ||
["very-low", "low", "high"], | ||
Benchmark.n_jobs_vals, | ||
) | ||
dim_to_number = {"very-low": 3, "low": 20, "high": 200} | ||
|
||
def setup_cache(self): | ||
super().setup_cache() | ||
|
||
def make_data(self, params): | ||
algorithm, dimension, n_jobs = params | ||
|
||
n_features = self.dim_to_number[dimension] | ||
|
||
data = _synth_regression_dataset(n_samples=10000, n_features=n_features) | ||
|
||
return data | ||
|
||
def make_estimator(self, params): | ||
algorithm, dimension, n_jobs = params | ||
|
||
estimator = RadiusNeighborsRegressor( | ||
algorithm=algorithm, n_jobs=n_jobs, radius=0.05 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Due to the curse of dimensionality, the radius is notably small for |
||
) | ||
|
||
return estimator | ||
|
||
def make_scorers(self): | ||
make_gen_reg_scorers(self) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
import warnings | ||
|
||
import numpy as np | ||
from scipy.sparse import csr_matrix | ||
|
||
from ._base import _get_weights | ||
from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin | ||
|
@@ -466,25 +467,29 @@ def predict(self, X): | |
if _y.ndim == 1: | ||
_y = _y.reshape((-1, 1)) | ||
|
||
empty_obs = np.full_like(_y[0], np.nan) | ||
# converting weights to sparse matrix | ||
n_samples = len(neigh_ind) | ||
n_neigh = np.asarray([len(ind) for ind in neigh_ind]) | ||
|
||
if weights is None: | ||
y_pred = np.array( | ||
[ | ||
np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs | ||
for (i, ind) in enumerate(neigh_ind) | ||
] | ||
) | ||
neigh_dst = np.concatenate(neigh_ind, axis=0) | ||
neigh_src = np.repeat(np.arange(n_samples), repeats=n_neigh) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This matrix might have a significant memory footprint. |
||
|
||
if weights is None: | ||
weights = np.ones(len(neigh_src), dtype=bool) | ||
else: | ||
y_pred = np.array( | ||
[ | ||
np.average(_y[ind, :], axis=0, weights=weights[i]) | ||
if len(ind) | ||
else empty_obs | ||
for (i, ind) in enumerate(neigh_ind) | ||
] | ||
) | ||
weights = np.concatenate(weights, axis=0) | ||
|
||
weights = csr_matrix( | ||
(weights, (neigh_src, neigh_dst)), | ||
shape=(n_samples, len(_y)), | ||
) | ||
|
||
# normalization factor so weights sum = 1 | ||
norm_factor = weights.sum(axis=1) | ||
y_pred = weights @ _y | ||
with np.errstate(divide="ignore"): | ||
# normalizing and assigning NaN to lonely samples | ||
y_pred = np.where(norm_factor > 0, y_pred / norm_factor, np.nan) | ||
|
||
if np.any(np.isnan(y_pred)): | ||
empty_warning_msg = ( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A similar improvement has been tried in #23721, but it suffered from memory overconsumption as
n_samples
got larger (see #23721 (review)). Is it possible to report ASV results for
n_samples=int(1e6)
or even n_samples=int(1e7)
?