-
-
Notifications
You must be signed in to change notification settings - Fork 26.3k
Open
Labels
Description
First reported in: scikit-learn-contrib/imbalanced-learn#1056
We have a performance regression in `kneighbors`
with sparse matrices from 1.1.X to 1.3.X.
A code sample to reproduce:
# %%
import sklearn
sklearn.__version__
# %%
import numpy as np
from scipy import sparse
from sklearn.neighbors import KNeighborsRegressor
n_samples, n_features = 1_000, 10_000
X = sparse.random(n_samples, n_features, density=0.01, format="csr", random_state=0)
rng = np.random.default_rng(0)
y = rng.integers(0, 2, size=n_samples)
knn = KNeighborsRegressor(n_neighbors=5).fit(X, y)
# %%
%%timeit
knn.kneighbors(X, return_distance=False)
1.1.X
21.5 ms ± 217 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
main
1.16 s ± 9.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Small benchmark
# %%
import sklearn

sklearn.__version__

# %%
import time
from collections import defaultdict

import numpy as np
from scipy import sparse
from sklearn.neighbors import KNeighborsRegressor

# Keep the number of samples fixed and sweep over the number of features.
n_samples, n_features = 1_000, [500, 1_000, 2_500, 5_000, 7_500, 10_000, 25_000, 50_000]

# Column name -> list of values, one entry per benchmarked configuration.
results = defaultdict(list)
for nf in n_features:
    X = sparse.random(n_samples, nf, density=0.01, format="csr", random_state=0)
    rng = np.random.default_rng(0)
    y = rng.integers(0, 2, size=n_samples)
    knn = KNeighborsRegressor(n_neighbors=5).fit(X, y)

    # Time only the neighbors query, not the data generation or the fit.
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    knn.kneighbors(X, return_distance=False)
    elapsed_time = time.perf_counter() - start

    results["version"].append(sklearn.__version__)
    results["n_features"].append(nf)
    results["elapsed_time"].append(elapsed_time)

# %%
import pandas as pd

# One CSV per installed scikit-learn version, named after that version.
results = pd.DataFrame(results)
results.to_csv(f"bench_{sklearn.__version__}.csv", index=False)

# %%
import pandas as pd

# Run this cell once the benchmark above has been executed under both
# versions, so that both CSV files exist.
results_main = pd.read_csv("bench_1.5.dev0.csv")
results_1_1 = pd.read_csv("bench_1.1.3.csv")
results = pd.concat([results_main, results_1_1], axis=0)
results

# %%
import seaborn as sns

sns.set_context("talk")

# %%
# Elapsed time as a function of the number of features, one line per version.
sns.relplot(
    data=results,
    x="n_features",
    y="elapsed_time",
    hue="version",
    kind="line",
    height=6,
    aspect=1.5,
    legend=True,
)
sns.despine(offset=10, trim=True)
Debug profiling
My first investigation suggests that we are spending all our time in the following function:
scikit-learn/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
Lines 39 to 69 in 5c7e831
cdef void _middle_term_sparse_sparse_64(
    const float64_t[:] X_data,
    const int32_t[:] X_indices,
    const int32_t[:] X_indptr,
    intp_t X_start,
    intp_t X_end,
    const float64_t[:] Y_data,
    const int32_t[:] Y_indices,
    const int32_t[:] Y_indptr,
    intp_t Y_start,
    intp_t Y_end,
    float64_t * D,
) noexcept nogil:
    # Accumulate into D, for every pair of rows (i, j) with
    # i in [X_start, X_end) and j in [Y_start, Y_end), the sum of
    # -2 * x * y over the stored entries x of row i and y of row j
    # that share the same column index.
    #
    # The (data, indices, indptr) triplets are read the way CSR arrays are
    # laid out: indptr[r] and indptr[r + 1] delimit the slice of
    # indices/data holding the stored entries of row r.
    #
    # This routine assumes that D points to the first element of a
    # zeroed buffer of length at least equal to n_X × n_Y, conceptually
    # representing a 2-d C-ordered array.
    cdef:
        intp_t i, j, k
        intp_t n_X = X_end - X_start
        intp_t n_Y = Y_end - Y_start
        intp_t x_col, x_ptr, y_col, y_ptr

    # NOTE: for each stored entry of an X row, every stored entry of every
    # Y row in the chunk is visited, so the amount of work grows with
    # (stored entries in the X chunk) × (stored entries in the Y chunk).
    for i in range(n_X):
        for x_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
            x_col = X_indices[x_ptr]
            for j in range(n_Y):
                # Flat index of element (i, j) in the C-ordered output.
                k = i * n_Y + j
                for y_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
                    y_col = Y_indices[y_ptr]
                    # Only entries in the same column contribute.
                    if x_col == y_col:
                        D[k] += -2 * X_data[x_ptr] * Y_data[y_ptr]
But I'm a bit rusty with profiling native code. I need a bit more time to investigate.
jjerphan