From 859b40e1185635b00f1b97dc15faecc42be07357 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Sat, 4 Jan 2025 12:07:26 +0100
Subject: [PATCH 1/3] TST Fix doctest due to floating point difference in numpy

---
 doc/common_pitfalls.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/common_pitfalls.rst b/doc/common_pitfalls.rst
index c16385943f9ad..c356f3f5982f3 100644
--- a/doc/common_pitfalls.rst
+++ b/doc/common_pitfalls.rst
@@ -225,7 +225,7 @@ method is used during fitting and predicting::
     >>> from sklearn.model_selection import cross_val_score
     >>> scores = cross_val_score(pipeline, X, y)
     >>> print(f"Mean accuracy: {scores.mean():.2f}+/-{scores.std():.2f}")
-    Mean accuracy: 0.46+/-0.07
+    Mean accuracy: 0.4...+/-0.07
 
 .. _randomness:
 

From ea03721a7ba668b8c6d0b31a1c47e716121fe24f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Sat, 4 Jan 2025 12:07:55 +0100
Subject: [PATCH 2/3] [azure parallel] [free-threaded] [scipy-dev]

From c043775ea68ed36e8da3e4c6a57900014618a665 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Mon, 6 Jan 2025 06:43:38 +0100
Subject: [PATCH 3/3] [scipy-dev] [azure parallel] use HistGradientBoostingClassifier which seems more stable for some reason

---
 doc/common_pitfalls.rst | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/doc/common_pitfalls.rst b/doc/common_pitfalls.rst
index c356f3f5982f3..63d2893cec479 100644
--- a/doc/common_pitfalls.rst
+++ b/doc/common_pitfalls.rst
@@ -160,7 +160,7 @@ much higher than expected accuracy score::
 
     >>> from sklearn.model_selection import train_test_split
     >>> from sklearn.feature_selection import SelectKBest
-    >>> from sklearn.ensemble import GradientBoostingClassifier
+    >>> from sklearn.ensemble import HistGradientBoostingClassifier
     >>> from sklearn.metrics import accuracy_score
 
     >>> # Incorrect preprocessing: the entire data is transformed
@@ -168,9 +168,9 @@ much higher than expected accuracy score::
     >>> X_train, X_test, y_train, y_test = train_test_split(
     ...     X_selected, y, random_state=42)
 
-    >>> gbc = GradientBoostingClassifier(random_state=1)
+    >>> gbc = HistGradientBoostingClassifier(random_state=1)
     >>> gbc.fit(X_train, y_train)
-    GradientBoostingClassifier(random_state=1)
+    HistGradientBoostingClassifier(random_state=1)
 
     >>> y_pred = gbc.predict(X_test)
     >>> accuracy_score(y_test, y_pred)
@@ -189,14 +189,14 @@ data, close to chance::
     >>> select = SelectKBest(k=25)
     >>> X_train_selected = select.fit_transform(X_train, y_train)
 
-    >>> gbc = GradientBoostingClassifier(random_state=1)
+    >>> gbc = HistGradientBoostingClassifier(random_state=1)
     >>> gbc.fit(X_train_selected, y_train)
-    GradientBoostingClassifier(random_state=1)
+    HistGradientBoostingClassifier(random_state=1)
 
     >>> X_test_selected = select.transform(X_test)
     >>> y_pred = gbc.predict(X_test_selected)
     >>> accuracy_score(y_test, y_pred)
-    0.46
+    0.5
 
 Here again, we recommend using a :class:`~sklearn.pipeline.Pipeline` to chain
 together the feature selection and model estimators. The pipeline ensures
@@ -207,15 +207,15 @@ is used only for calculating the accuracy score::
     >>> X_train, X_test, y_train, y_test = train_test_split(
     ...     X, y, random_state=42)
     >>> pipeline = make_pipeline(SelectKBest(k=25),
-    ...                          GradientBoostingClassifier(random_state=1))
+    ...                          HistGradientBoostingClassifier(random_state=1))
     >>> pipeline.fit(X_train, y_train)
     Pipeline(steps=[('selectkbest', SelectKBest(k=25)),
-                    ('gradientboostingclassifier',
-                     GradientBoostingClassifier(random_state=1))])
+                    ('histgradientboostingclassifier',
+                     HistGradientBoostingClassifier(random_state=1))])
 
     >>> y_pred = pipeline.predict(X_test)
     >>> accuracy_score(y_test, y_pred)
-    0.46
+    0.5
 
 The pipeline can also be fed into a cross-validation function such as
 :func:`~sklearn.model_selection.cross_val_score`.
@@ -225,7 +225,7 @@ method is used during fitting and predicting::
     >>> from sklearn.model_selection import cross_val_score
     >>> scores = cross_val_score(pipeline, X, y)
     >>> print(f"Mean accuracy: {scores.mean():.2f}+/-{scores.std():.2f}")
-    Mean accuracy: 0.4...+/-0.07
+    Mean accuracy: 0.43+/-0.05
 
 .. _randomness:
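
Note on the hunks above: they only swap the estimator and update the doctest
outputs; the construction of ``X`` and ``y`` happens earlier in
``doc/common_pitfalls.rst`` and is not part of these diffs. A minimal
standalone sketch of the leak-free, pipeline-based pattern the patches
converge on is below, assuming pure-noise data (features unrelated to the
target), consistent with the doc's point that the honest accuracy estimate
should land near chance::

    # Standalone sketch; the X/y setup is an assumption, not taken from the
    # diffs above.
    import numpy as np
    from sklearn.ensemble import HistGradientBoostingClassifier
    from sklearn.feature_selection import SelectKBest
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import make_pipeline

    rng = np.random.RandomState(42)
    X = rng.standard_normal((200, 10000))  # noise features, unrelated to y
    y = rng.choice(2, size=200)            # random binary target

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # SelectKBest is fit inside the pipeline, on the training split only,
    # so no information from X_test leaks into the feature selection.
    pipeline = make_pipeline(
        SelectKBest(k=25),
        HistGradientBoostingClassifier(random_state=1),
    )
    pipeline.fit(X_train, y_train)
    print(accuracy_score(y_test, pipeline.predict(X_test)))  # near chance

The doctest ellipsis introduced in patch 1 (``0.4...``) addresses the same
fragility at the documentation level: the later decimals vary across numpy
builds, so the example only pins the stable leading digit.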