class RadiusNeighborsRegressorBenchmark(Predictor, Estimator, Benchmark):
    """Benchmarks for RadiusNeighborsRegressor.

    The parameter grid combines the neighbor-search algorithm, a named
    dimensionality level and the ``n_jobs`` values taken from
    ``Benchmark.n_jobs_vals``.
    """

    # Number of synthetic features generated for each named dimensionality
    # level listed in ``params``.
    dim_to_number = {"very-low": 3, "low": 20, "high": 200}

    param_names = ["algorithm", "dimension", "n_jobs"]
    params = (
        ["brute", "kd_tree", "ball_tree"],
        ["very-low", "low", "high"],
        Benchmark.n_jobs_vals,
    )

    def setup_cache(self):
        # Pure delegation to the parent implementation.
        # NOTE(review): presumably kept as an explicit override so the
        # benchmark runner ties the cache to this class -- confirm before
        # removing it.
        super().setup_cache()

    def make_data(self, params):
        """Build the synthetic regression dataset for one parameter combo.

        Only the ``dimension`` entry of ``params`` is used; it selects the
        feature count through ``dim_to_number``. The sample count is fixed
        at 10000.
        """
        _, dimension, _ = params
        return _synth_regression_dataset(
            n_samples=10000, n_features=self.dim_to_number[dimension]
        )

    def make_estimator(self, params):
        """Create the RadiusNeighborsRegressor for one parameter combo.

        ``algorithm`` and ``n_jobs`` are forwarded from ``params``; the
        ``dimension`` entry is not used here.
        """
        algorithm, _, n_jobs = params
        # NOTE(review): the radius is fixed at 0.05 for every dimension
        # setting -- confirm samples still have neighbors at 200 features.
        return RadiusNeighborsRegressor(
            radius=0.05, algorithm=algorithm, n_jobs=n_jobs
        )

    def make_scorers(self):
        """Attach the generic regression scorers to this benchmark."""
        make_gen_reg_scorers(self)
csr_matrix from ._base import _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin @@ -466,25 +467,29 @@ def predict(self, X): if _y.ndim == 1: _y = _y.reshape((-1, 1)) - empty_obs = np.full_like(_y[0], np.nan) + # converting weights to sparse matrix + n_samples = len(neigh_ind) + n_neigh = np.asarray([len(ind) for ind in neigh_ind]) - if weights is None: - y_pred = np.array( - [ - np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs - for (i, ind) in enumerate(neigh_ind) - ] - ) + neigh_dst = np.concatenate(neigh_ind, axis=0) + neigh_src = np.repeat(np.arange(n_samples), repeats=n_neigh) + if weights is None: + weights = np.ones(len(neigh_src), dtype=bool) else: - y_pred = np.array( - [ - np.average(_y[ind, :], axis=0, weights=weights[i]) - if len(ind) - else empty_obs - for (i, ind) in enumerate(neigh_ind) - ] - ) + weights = np.concatenate(weights, axis=0) + + weights = csr_matrix( + (weights, (neigh_src, neigh_dst)), + shape=(n_samples, len(_y)), + ) + + # normalization factor so weights sum = 1 + norm_factor = weights.sum(axis=1) + y_pred = weights @ _y + with np.errstate(divide="ignore"): + # normalizing and assigning NaN to lonely samples + y_pred = np.where(norm_factor > 0, y_pred / norm_factor, np.nan) if np.any(np.isnan(y_pred)): empty_warning_msg = (