From a47ebe2e479d3cb6e247b046d6a0954e77453729 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 17:13:54 +0100
Subject: [PATCH 01/31] BUG ensure that parallel/sequential provide the same results

---
 sklearn/inspection/_permutation_importance.py |  9 ++++----
 .../tests/test_permutation_importance.py      | 21 +++++++++++++++++++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d71d5fd3f3a68..015ae10af5711 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -28,6 +28,7 @@ def _safe_column_indexing(X, col_idx):
 def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
                                   n_repeats, scorer):
     """Calculate score when `col_idx` is permuted."""
+    random_state = check_random_state(random_state)
     original_feature = _safe_column_indexing(X, col_idx).copy()
     temp = original_feature.copy()
 
@@ -110,15 +111,15 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
         X = check_array(X, force_all_finite='allow-nan', dtype=np.object,
                         copy=True)
 
+    MAX_RAND_SEED = np.iinfo(np.int32).max
     random_state = check_random_state(random_state)
+    many_random_state = random_state.randint(MAX_RAND_SEED, size=X.shape[0])
     scorer = check_scoring(estimator, scoring=scoring)
-
     baseline_score = scorer(estimator, X, y)
-    scores = np.zeros((X.shape[1], n_repeats))
 
     scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores)(
-        estimator, X, y, col_idx, random_state, n_repeats, scorer
-    ) for col_idx in range(X.shape[1]))
+        estimator, X, y, col_idx, rand_int, n_repeats, scorer
+    ) for rand_int, col_idx in zip(many_random_state, range(X.shape[1])))
 
     importances = baseline_score - np.array(scores)
     return Bunch(importances_mean=np.mean(importances, axis=1),
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 671a1e11b1fec..49b574e51e4be 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -150,3 +150,24 @@ def test_permutation_importance_linear_regresssion():
                               scoring='neg_mean_squared_error')
     assert_allclose(expected_importances, results.importances_mean,
                     rtol=1e-1, atol=1e-6)
+
+
+def test_permutation_importance_equivalence_sequential_paralell():
+    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
+
+    X = scale(X)
+    y = scale(y)
+
+    lr = LinearRegression().fit(X, y)
+
+    importance_parallel = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+    )
+    importance_sequential = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=1
+    )
+
+    assert_allclose(
+        importance_parallel['importances'],
+        importance_sequential['importances']
+    )

From ab95fd53fe1668bfd04a973e286bda6813550c79 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 17:26:34 +0100
Subject: [PATCH 02/31] iter

---
 sklearn/inspection/_permutation_importance.py       | 12 +++++++++---
 .../inspection/tests/test_permutation_importance.py |  2 ++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 015ae10af5711..7e326ce61dab7 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -111,15 +111,21 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
         X = check_array(X, force_all_finite='allow-nan', dtype=np.object,
                         copy=True)
 
+    # Precompute random seed from the random state to be used
+    # to get a fresh independent RandomState instance for each
+    # parallel call to _calculate_permutation_scores, irrespective of
+    # the fact that variables are shared or not depending on the active
+    # joblib backend (sequential, thread-based or process-based).
     MAX_RAND_SEED = np.iinfo(np.int32).max
     random_state = check_random_state(random_state)
-    many_random_state = random_state.randint(MAX_RAND_SEED, size=X.shape[0])
+    random_seed = random_state.randint(0, MAX_RAND_SEED)
+
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)
 
     scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores)(
-        estimator, X, y, col_idx, rand_int, n_repeats, scorer
-    ) for rand_int, col_idx in zip(many_random_state, range(X.shape[1])))
+        estimator, X, y, col_idx, random_seed, n_repeats, scorer
+    ) for col_idx in range(X.shape[1]))
 
     importances = baseline_score - np.array(scores)
     return Bunch(importances_mean=np.mean(importances, axis=1),
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 49b574e51e4be..b67598c82e3d9 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -153,6 +153,8 @@ def test_permutation_importance_linear_regresssion():
 
 def test_permutation_importance_equivalence_sequential_paralell():
+    # regression test to make sure that sequential and parallel calls will
+    # output the same results.
     X, y = make_regression(n_samples=500, n_features=10, random_state=0)
 
     X = scale(X)
     y = scale(y)

From d58272da3f33471fa20134c9e136c06d74ef6faa Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 17:30:00 +0100
Subject: [PATCH 03/31] whats new

---
 doc/whats_new/v0.22.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 19a8327783b20..d1e18e23f70b5 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -15,6 +15,14 @@ This is a bug-fix release to primarily resolve some packaging issues in version
 Changelog
 ---------
 
+:mod:`sklearn.inspection`
+.........................
+
+- |Fix| :func:`inspection.permutation_importance` will return the same
+  `importances` when a `random_state` is given for both `n_jobs=1` or
+  `n_jobs>1`.
+  :pr:`15993` by :user:`Guillaume Lemaitre `.
+
 :mod:`sklearn.metrics`
 ......................
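The seeding scheme introduced in patch 02 can be exercised outside scikit-learn. Below is a minimal, self-contained sketch, not part of the patch series: `shuffled_first_value` is a made-up stand-in for `_calculate_permutation_scores`. It shows why drawing a plain integer seed up front and re-seeding inside each worker makes the output independent of the joblib backend, since no `RandomState` object is ever shared between workers:

    import numpy as np
    from joblib import Parallel, delayed
    from sklearn.utils import check_random_state

    def shuffled_first_value(seed, column):
        # Re-seed inside the worker: every call builds its own RandomState
        # from the integer seed, so the result cannot depend on whether the
        # joblib backend shares memory (threads) or copies it (processes).
        rng = check_random_state(seed)
        column = column.copy()
        rng.shuffle(column)
        return column[0]

    rng = check_random_state(0)
    seed = rng.randint(0, np.iinfo(np.int32).max)
    X = np.arange(50, dtype=float).reshape(10, 5)

    sequential = Parallel(n_jobs=1)(
        delayed(shuffled_first_value)(seed, X[:, j]) for j in range(X.shape[1]))
    parallel = Parallel(n_jobs=2)(
        delayed(shuffled_first_value)(seed, X[:, j]) for j in range(X.shape[1]))
    assert sequential == parallel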
From 7211a53d1c2eefc408905b2e9222e7f832dec0da Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 18:24:39 +0100
Subject: [PATCH 04/31] Check that the test is not trivial

---
 sklearn/inspection/tests/test_permutation_importance.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index b67598c82e3d9..a3bcf5c410b38 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -169,6 +169,13 @@ def test_permutation_importance_equivalence_sequential_paralell():
         lr, X, y, n_repeats=5, random_state=0, n_jobs=1
     )
 
+    # First check that the problem is structured enough and that the model is
+    # complex enough to not yield trivial, constant importances:
+    imp_min = importance_sequential['importances'].min()
+    imp_max = importance_sequential['importances'].max()
+    assert imp_max - imp_min > 0.3
+
+    # Then actually check that parallelism does not impact the results:
     assert_allclose(
         importance_parallel['importances'],
         importance_sequential['importances']
     )

From 70e1ef433ff955130d88c31a51557231772f3c9e Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 18:40:39 +0100
Subject: [PATCH 05/31] Typo in PR number

---
 doc/whats_new/v0.22.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index d1e18e23f70b5..b9e926bf8d702 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -21,7 +21,7 @@ Changelog
 - |Fix| :func:`inspection.permutation_importance` will return the same
   `importances` when a `random_state` is given for both `n_jobs=1` or
   `n_jobs>1`.
-  :pr:`15993` by :user:`Guillaume Lemaitre `.
+  :pr:`15933` by :user:`Guillaume Lemaitre `.
 
 :mod:`sklearn.metrics`
 ......................

From a6909ca6fe95af6183552975e2657b35045a7ce6 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 19:26:55 +0100
Subject: [PATCH 06/31] Fix thread-safety issue

---
 sklearn/inspection/_permutation_importance.py | 22 ++++++++---------
 .../tests/test_permutation_importance.py      | 24 +++++++++++++++----
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 7e326ce61dab7..d8475ed5ebf7a 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -29,17 +29,18 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
                                   n_repeats, scorer):
     """Calculate score when `col_idx` is permuted."""
     random_state = check_random_state(random_state)
-    original_feature = _safe_column_indexing(X, col_idx).copy()
-    temp = original_feature.copy()
+
+    # Work on a copy of X to ensure thread-safety in case of threading
+    # based parallelism:
+    X_permuted = X.copy()
+    column_data = _safe_column_indexing(X_permuted, col_idx)
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
-        random_state.shuffle(temp)
-        _safe_column_setting(X, col_idx, temp)
-        feature_score = scorer(estimator, X, y)
+        random_state.shuffle(column_data)
+        _safe_column_setting(X_permuted, col_idx, column_data)
+        feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score
-        _safe_column_setting(X, col_idx, original_feature)
 
     return scores
@@ -105,18 +106,15 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     .. [BRE] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32,
              2001. https://doi.org/10.1023/A:1010933404324
     """
-    if hasattr(X, "iloc"):
-        X = X.copy()  # Dataframe
-    else:
-        X = check_array(X, force_all_finite='allow-nan', dtype=np.object,
-                        copy=True)
+    if not hasattr(X, "iloc"):
+        X = check_array(X, force_all_finite='allow-nan', dtype=None)
 
     # Precompute random seed from the random state to be used
     # to get a fresh independent RandomState instance for each
     # parallel call to _calculate_permutation_scores, irrespective of
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
-    MAX_RAND_SEED = np.iinfo(np.int32).max
+    MAX_RAND_SEED = np.iinfo(np.uint32).max
     random_state = check_random_state(random_state)
     random_seed = random_state.randint(0, MAX_RAND_SEED)
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index a3bcf5c410b38..66a41407e6919 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -17,6 +17,8 @@ from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import scale
+from sklearn.utils import parallel_backend
+
 
 @pytest.mark.parametrize("n_jobs", [1, 2])
 def test_permutation_importance_correlated_feature_regression(n_jobs):
@@ -162,9 +164,6 @@ def test_permutation_importance_equivalence_sequential_paralell():
 
     lr = LinearRegression().fit(X, y)
 
-    importance_parallel = permutation_importance(
-        lr, X, y, n_repeats=5, random_state=0, n_jobs=2
-    )
     importance_sequential = permutation_importance(
         lr, X, y, n_repeats=5, random_state=0, n_jobs=1
     )
@@ -175,8 +174,23 @@ def test_permutation_importance_equivalence_sequential_paralell():
     imp_max = importance_sequential['importances'].max()
     assert imp_max - imp_min > 0.3
 
-    # Then actually check that parallelism does not impact the results:
+    # Then actually check that parallelism does not impact the results
+    # either with shared memory (threading) or without isolated memory
+    # via process-based parallelism using loky:
+    with parallel_backend("threading"):
+        importance_threading = permutation_importance(
+            lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+        )
+    assert_allclose(
+        importance_threading['importances'],
+        importance_sequential['importances']
+    )
+
+    with parallel_backend("loky"):
+        importance_loky = permutation_importance(
+            lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+        )
     assert_allclose(
-        importance_parallel['importances'],
+        importance_loky['importances'],
         importance_sequential['importances']
     )
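Patch 06 replaces the shuffle-then-restore pattern with a per-worker `X.copy()`. The following standalone sketch contrasts the two patterns; `permute_unsafe` and `permute_safe` are hypothetical helpers written for illustration, not scikit-learn code:

    import numpy as np

    X_shared = np.arange(12, dtype=float).reshape(6, 2)

    def permute_unsafe(col_idx, rng):
        # BAD under the threading backend: mutates the shared array, so a
        # concurrent worker may score the estimator while this column is
        # shuffled, or restore it halfway through another worker's shuffle.
        original = X_shared[:, col_idx].copy()
        rng.shuffle(X_shared[:, col_idx])
        # ... scorer(estimator, X_shared, y) would run here ...
        X_shared[:, col_idx] = original

    def permute_safe(col_idx, rng):
        # GOOD: each worker shuffles its own private copy; the shared
        # input is never written to, so no locking is needed.
        X_local = X_shared.copy()
        rng.shuffle(X_local[:, col_idx])
        # ... scorer(estimator, X_local, y) would run here ...

    permute_safe(0, np.random.RandomState(0))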
From 7ad0b26341fe6e29d12271aadb3a4856cb807d4c Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 19:37:51 +0100
Subject: [PATCH 07/31] Add non-regression test to check that issue 15810 is fixed.

---
 doc/whats_new/v0.22.rst                          |  7 +++--
 .../tests/test_permutation_importance.py         | 29 +++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index b9e926bf8d702..418680a6808cf 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -20,8 +20,11 @@ Changelog
 
 - |Fix| :func:`inspection.permutation_importance` will return the same
   `importances` when a `random_state` is given for both `n_jobs=1` or
-  `n_jobs>1`.
-  :pr:`15933` by :user:`Guillaume Lemaitre `.
+  `n_jobs>1` both with shared memory backends (thread-safety) and
+  isolated memory, process-based backends.
+  Also avoid casting the data as object dtype and avoid a read-only error
+  on large dataframes with `n_jobs>1` as reported in :issue:`15810`.
+  :pr:`15933` by :user:`Guillaume Lemaitre ` and `Olivier Grisel`_.
 
 :mod:`sklearn.metrics`
 ......................
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 66a41407e6919..bc7f2ca75c5e1 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -6,7 +6,9 @@ from sklearn.compose import ColumnTransformer
 from sklearn.datasets import load_boston
 from sklearn.datasets import load_iris
+from sklearn.datasets import make_classification
 from sklearn.datasets import make_regression
+from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LinearRegression
@@ -194,3 +196,30 @@ def test_permutation_importance_equivalence_sequential_paralell():
         importance_loky['importances'],
         importance_sequential['importances']
     )
+
+
+@pytest.mark.parametrize("input_type", ["array", "dataframe"])
+def test_permutation_importance_large_memmaped_data(input_type):
+    # Smoke, non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/15810
+    n_samples, n_features = int(5e4), 4
+    X, y = make_classification(n_samples=n_samples, n_features=n_features,
+                               random_state=0)
+    assert X.nbytes > 1e6  # trigger joblib memmapping
+
+    if input_type == "dataframe":
+        pd = pytest.importorskip("pandas")
+        X = pd.DataFrame(X)
+    else:
+        assert input_type == "array"
+
+    clf = DummyClassifier(strategy='prior').fit(X, y)
+
+    # Actual smoke test: should not raise any error:
+    n_repeats = 5
+    r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2)
+
+    # Auxiliary check: dummy classifier is feature indpendent:
+    # permuting a feature should not change the predictions
+    expected_importances = np.zeros((n_features, n_repeats))
+    assert_allclose(expected_importances, r.importances)

From 7ad0b26341fe6e29d12271aadb3a4856cb807d4c Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 19:50:27 +0100
Subject: [PATCH 08/31] Leaner test

---
 sklearn/inspection/tests/test_permutation_importance.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index bc7f2ca75c5e1..27ca369a6d138 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -160,10 +160,6 @@ def test_permutation_importance_equivalence_sequential_paralell():
     # regression test to make sure that sequential and parallel calls will
     # output the same results.
     X, y = make_regression(n_samples=500, n_features=10, random_state=0)
-
-    X = scale(X)
-    y = scale(y)
-
     lr = LinearRegression().fit(X, y)
 
     importance_sequential = permutation_importance(

From fb69870852627844056b91910e639bd6d73d72d4 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 20:12:55 +0100
Subject: [PATCH 09/31] Support joblib 0.11

---
 .../tests/test_permutation_importance.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 27ca369a6d138..892ebc25a4a0a 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -174,22 +174,24 @@ def test_permutation_importance_equivalence_sequential_paralell():
 
     # Then actually check that parallelism does not impact the results
     # either with shared memory (threading) or without isolated memory
-    # via process-based parallelism using loky:
-    with parallel_backend("threading"):
-        importance_threading = permutation_importance(
-            lr, X, y, n_repeats=5, random_state=0, n_jobs=2
-        )
+    # via process-based parallelism using the default backend
+    # ('loky' or 'multiprocessing') depending on the joblib version:
+
+    # process-based parallelism (by default):
+    importance_processes = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=2
     assert_allclose(
-        importance_threading['importances'],
+        importance_processes['importances'],
         importance_sequential['importances']
     )
 
-    with parallel_backend("loky"):
-        importance_loky = permutation_importance(
+    # thread-based parallelism:
+    with parallel_backend("threading"):
+        importance_threading = permutation_importance(
             lr, X, y, n_repeats=5, random_state=0, n_jobs=2
         )
     assert_allclose(
-        importance_loky['importances'],
+        importance_threading['importances'],
         importance_sequential['importances']
     )
@@ -215,7 +217,7 @@ def test_permutation_importance_large_memmaped_data(input_type):
     n_repeats = 5
     r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2)
 
-    # Auxiliary check: dummy classifier is feature indpendent:
+    # Auxiliary check: DummyClassifier is feature independent:
     # permuting a feature should not change the predictions
     expected_importances = np.zeros((n_features, n_repeats))
     assert_allclose(expected_importances, r.importances)

From f80dc6946798a8f25cdc348ff61d551ef574da2b Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Thu, 19 Dec 2019 15:08:07 -0500
Subject: [PATCH 10/31] BUG Syntax error

---
 sklearn/inspection/tests/test_permutation_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 892ebc25a4a0a..2435c65121a17 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -179,7 +179,7 @@ def test_permutation_importance_equivalence_sequential_paralell():
 
     # process-based parallelism (by default):
     importance_processes = permutation_importance(
-        lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=2)
     assert_allclose(
         importance_processes['importances'],
         importance_sequential['importances']
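Patch 09 stops naming the process-based backend explicitly because joblib 0.11 predates 'loky'. A small hedged sketch of the joblib API the test relies on (assuming only that joblib is installed; `square` is a toy worker):

    from joblib import Parallel, delayed, parallel_backend

    def square(i):
        return i * i

    # Default backend is process-based: 'loky' on recent joblib,
    # 'multiprocessing' on joblib 0.11 -- hence the test no longer names it.
    print(Parallel(n_jobs=2)(delayed(square)(i) for i in range(4)))

    # The shared-memory threading backend can still be forced explicitly,
    # which is what the updated test does:
    with parallel_backend("threading"):
        print(Parallel(n_jobs=2)(delayed(square)(i) for i in range(4)))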
From 775e98698ed62045893782894788403aa6b4c1d2 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 21:40:44 +0100
Subject: [PATCH 11/31] MAX_RAND_SEED should be int32

---
 sklearn/inspection/_permutation_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d8475ed5ebf7a..e9849926b1deb 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -114,7 +114,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # parallel call to _calculate_permutation_scores, irrespective of
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
-    MAX_RAND_SEED = np.iinfo(np.uint32).max
+    MAX_RAND_SEED = np.iinfo(np.int32).max
     random_state = check_random_state(random_state)
     random_seed = random_state.randint(0, MAX_RAND_SEED)

From 063129921d4074fb97a687ebb65f8ef6542e8a73 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 22:18:59 +0100
Subject: [PATCH 12/31] cosmetic

---
 sklearn/inspection/_permutation_importance.py | 26 +++++--------------
 .../tests/test_permutation_importance.py      | 10 +++----
 2 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index e9849926b1deb..8c5c197548fc5 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -4,25 +4,10 @@
 from joblib import delayed
 
 from ..metrics import check_scoring
+from ..utils import Bunch
 from ..utils import check_random_state
 from ..utils import check_array
-from ..utils import Bunch
-
-
-def _safe_column_setting(X, col_idx, values):
-    """Set column on X using `col_idx`"""
-    if hasattr(X, "iloc"):
-        X.iloc[:, col_idx] = values
-    else:
-        X[:, col_idx] = values
-
-
-def _safe_column_indexing(X, col_idx):
-    """Return column from X using `col_idx`"""
-    if hasattr(X, "iloc"):
-        return X.iloc[:, col_idx].values
-    else:
-        return X[:, col_idx]
+from ..utils import _safe_indexing
 
 
 def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
@@ -33,11 +18,14 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     # Work on a copy of X to ensure thread-safety in case of threading
     # based parallelism:
     X_permuted = X.copy()
-    column_data = _safe_column_indexing(X_permuted, col_idx)
+    column_data = np.asarray(_safe_indexing(X_permuted, col_idx, axis=1))
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
         random_state.shuffle(column_data)
-        _safe_column_setting(X_permuted, col_idx, column_data)
+        if hasattr(X_permuted, "iloc"):
+            X_permuted.iloc[:, col_idx] = column_data
+        else:
+            X_permuted[:, col_idx] = column_data
         feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 2435c65121a17..a27941c0a5625 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -20,6 +20,7 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import scale
 from sklearn.utils import parallel_backend
+from sklearn.utils._testing import _convert_container
 
 
 @pytest.mark.parametrize("n_jobs", [1, 2])
 def test_permutation_importance_correlated_feature_regression(n_jobs):
@@ -156,7 +157,7 @@ def test_permutation_importance_linear_regresssion():
                     rtol=1e-1, atol=1e-6)
 
 
-def test_permutation_importance_equivalence_sequential_paralell():
+def test_permutation_importance_equivalence_sequential_parallel():
     # regression test to make sure that sequential and parallel calls will
     # output the same results.
     X, y = make_regression(n_samples=500, n_features=10, random_state=0)
@@ -205,12 +206,7 @@ def test_permutation_importance_large_memmaped_data(input_type):
                                random_state=0)
     assert X.nbytes > 1e6  # trigger joblib memmapping
 
-    if input_type == "dataframe":
-        pd = pytest.importorskip("pandas")
-        X = pd.DataFrame(X)
-    else:
-        assert input_type == "array"
-
+    X = _convert_container(X, input_type)
     clf = DummyClassifier(strategy='prior').fit(X, y)
 
     # Actual smoke test: should not raise any error:

From 1a21a982a6c21e423f4d91a55c4f24a6b2be134a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 23:13:51 +0100
Subject: [PATCH 13/31] inplace operation

---
 sklearn/inspection/_permutation_importance.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 8c5c197548fc5..5054279d5e15c 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -18,14 +18,13 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     # Work on a copy of X to ensure thread-safety in case of threading
     # based parallelism:
     X_permuted = X.copy()
-    column_data = np.asarray(_safe_indexing(X_permuted, col_idx, axis=1))
+    # Ensure to take a view on a column of X_permuted to make shuffling inplace
+    column_data = _safe_indexing(X_permuted, col_idx, axis=1)
+    if hasattr(X_permuted, "iloc"):
+        column_data = column_data.values
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
         random_state.shuffle(column_data)
-        if hasattr(X_permuted, "iloc"):
-            X_permuted.iloc[:, col_idx] = column_data
-        else:
-            X_permuted[:, col_idx] = column_data
         feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score

From 023eca27472170e3c67341f01137a314839f46eb Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 23:17:27 +0100
Subject: [PATCH 14/31] cosmit

---
 sklearn/inspection/_permutation_importance.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 5054279d5e15c..82abf1960d0c2 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -20,8 +20,9 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     X_permuted = X.copy()
     # Ensure to take a view on a column of X_permuted to make shuffling inplace
     column_data = _safe_indexing(X_permuted, col_idx, axis=1)
-    if hasattr(X_permuted, "iloc"):
-        column_data = column_data.values
+    column_data = getattr(column_data, "values", column_data)
+    # if hasattr(X_permuted, "iloc"):
+    #     column_data = column_data.values
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
         random_state.shuffle(column_data)
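Patch 14 collapses the `hasattr` branch into a single `getattr` call. A minimal illustration of that duck-typing idiom (pandas is imported here only to demonstrate the Series case; this snippet is not part of the patch series):

    import numpy as np
    import pandas as pd

    col_array = np.array([1.0, 2.0, 3.0])
    col_series = pd.Series(col_array)

    # getattr(obj, "values", obj) returns the underlying ndarray of a pandas
    # Series, and the object itself for a plain ndarray -- one line instead
    # of an explicit hasattr(X, "iloc") branch:
    for col in (col_array, col_series):
        data = getattr(col, "values", col)
        assert isinstance(data, np.ndarray)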
From e21323627c03d76a25f1307ff3cb9ccd38fcc111 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 23:18:48 +0100
Subject: [PATCH 15/31] cosmit

---
 sklearn/inspection/_permutation_importance.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 82abf1960d0c2..d959267132744 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -21,8 +21,6 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     # Ensure to take a view on a column of X_permuted to make shuffling inplace
     column_data = _safe_indexing(X_permuted, col_idx, axis=1)
     column_data = getattr(column_data, "values", column_data)
-    # if hasattr(X_permuted, "iloc"):
-    #     column_data = column_data.values
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
         random_state.shuffle(column_data)

From be8f1c11dceb550605850e7b1406b5fe5701adaf Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 01:19:28 +0100
Subject: [PATCH 16/31] Better comment explaining the need for X.copy()

---
 sklearn/inspection/_permutation_importance.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d959267132744..3c7f959577bf9 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -15,8 +15,12 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     """Calculate score when `col_idx` is permuted."""
     random_state = check_random_state(random_state)
 
-    # Work on a copy of X to ensure thread-safety in case of threading
-    # based parallelism:
+    # Work on a copy of X to ensure thread-safety in case of threading based
+    # parallelism. Furthermore, making a copy is also useful when the joblib
+    # backend is 'loky' (default) or the old 'multiprocessing': in those cases,
+    # if X is large it will automatically be backed by a readonly memory map
+    # (memmap). X.copy() on the other hand is always guaranteed to return a
+    # writable data-structure whose columns can be shuffled inplace.
     X_permuted = X.copy()
     # Ensure to take a view on a column of X_permuted to make shuffling inplace
     column_data = _safe_indexing(X_permuted, col_idx, axis=1)

From 910ef4f6bb5dbf32e8f287b6de8d6499d4498dba Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 09:29:03 +0100
Subject: [PATCH 17/31] Fix random seed range

---
 sklearn/inspection/_permutation_importance.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 3c7f959577bf9..f9bab27fca5e6 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -104,9 +104,8 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # parallel call to _calculate_permutation_scores, irrespective of
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
-    MAX_RAND_SEED = np.iinfo(np.int32).max
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(0, MAX_RAND_SEED)
+    random_seed = random_state.randint(0, 2 ** 32)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)

From 723bf03313b4c1380cf91d92c52832db1466d56b Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 09:44:34 +0100
Subject: [PATCH 18/31] Test exact equivalence in column shuffling of pandas dataframes with numpy arrays

---
 .../tests/test_permutation_importance.py | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index a27941c0a5625..d29a0c1e10fcd 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -197,6 +197,38 @@ def test_permutation_importance_equivalence_sequential_parallel():
     )
 
 
+@pytest.mark.parametrize("n_jobs", [None, 1, 2])
+def test_permutation_importance_equivalence_array_dataframe(n_jobs):
+    # This test checks that the column shuffling logic has the same behavior
+    # on both a dataframe and a simple numpy array.
+    pd = pytest.importorskip('pandas')
+
+    # regression test to make sure that sequential and parallel calls will
+    # output the same results.
+    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
+    X_df = pd.DataFrame(X)
+
+    lr = LinearRegression().fit(X, y)
+
+    importance_array = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=n_jobs
+    )
+
+    # First check that the problem is structured enough and that the model is
+    # complex enough to not yield trivial, constant importances:
+    imp_min = importance_array['importances'].min()
+    imp_max = importance_array['importances'].max()
+    assert imp_max - imp_min > 0.3
+
+    importance_dataframe = permutation_importance(
+        lr, X_df, y, n_repeats=5, random_state=0, n_jobs=n_jobs
+    )
+    assert_allclose(
+        importance_array['importances'],
+        importance_dataframe['importances']
+    )
+
+
 @pytest.mark.parametrize("input_type", ["array", "dataframe"])
 def test_permutation_importance_large_memmaped_data(input_type):
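The comment rewritten in patch 16 can be checked empirically. Below is a hedged sketch, not scikit-learn code, assuming joblib's default process-based backend and its auto-memmapping threshold (made explicit here via `max_nbytes`); `shuffle_first_column` is a made-up worker:

    import numpy as np
    from joblib import Parallel, delayed

    def shuffle_first_column(X):
        rng = np.random.RandomState(0)
        try:
            rng.shuffle(X[:, 0])       # raises on a read-only memmap
            return "wrote to the shared input"
        except ValueError:
            X_local = X.copy()         # a copy is always writable
            rng.shuffle(X_local[:, 0])
            return "shuffled a private copy"

    X = np.zeros((200000, 4))          # ~6.4 MB, above the 1 MB threshold
    print(Parallel(n_jobs=2, max_nbytes="1M")(
        delayed(shuffle_first_column)(X) for _ in range(2)))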
From f5bda8ce6c2b71b675613c1220c1f9d8daf07704 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:02:42 +0100
Subject: [PATCH 19/31] Add acknowledgment to 15898

---
 doc/whats_new/v0.22.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 418680a6808cf..69fb98a651225 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -24,6 +24,7 @@ Changelog
   isolated memory, process-based backends.
   Also avoid casting the data as object dtype and avoid a read-only error
   on large dataframes with `n_jobs>1` as reported in :issue:`15810`.
+  Follow-up of :pr:`15898` by :user:`Shivam Gargsya `.
   :pr:`15933` by :user:`Guillaume Lemaitre ` and `Olivier Grisel`_.
 
 :mod:`sklearn.metrics`

From e9770cf82387f74da543403cd52927f795b0de62 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:05:21 +0100
Subject: [PATCH 20/31] factorize max_int_32

---
 sklearn/ensemble/_base.py                     | 5 ++---
 sklearn/feature_extraction/text.py            | 3 ++-
 sklearn/inspection/_permutation_importance.py | 3 ++-
 sklearn/tree/_classes.py                      | 3 ++-
 sklearn/utils/__init__.py                     | 2 ++
 5 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py
index 5db30b9bbc600..2b097828f59b2 100644
--- a/sklearn/ensemble/_base.py
+++ b/sklearn/ensemble/_base.py
@@ -16,11 +16,10 @@
 from ..base import BaseEstimator
 from ..base import MetaEstimatorMixin
 from ..utils import Bunch
+from ..utils import MAX_INT_32
 from ..utils import check_random_state
 from ..utils.metaestimators import _BaseComposition
 
-MAX_RAND_SEED = np.iinfo(np.int32).max
-
 
 def _parallel_fit_estimator(estimator, X, y, sample_weight=None):
     """Private function used to fit an estimator within a job."""
@@ -71,7 +70,7 @@ def _set_random_states(estimator, random_state=None):
     to_set = {}
     for key in sorted(estimator.get_params(deep=True)):
         if key == 'random_state' or key.endswith('__random_state'):
-            to_set[key] = random_state.randint(MAX_RAND_SEED)
+            to_set[key] = random_state.randint(MAX_INT_32)
 
     if to_set:
         estimator.set_params(**to_set)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 2d8f7d840c55b..9f9c05ad7fd90 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -30,6 +30,7 @@
 from ._hash import FeatureHasher
 from ._stop_words import ENGLISH_STOP_WORDS
 from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
+from ..utils import MAX_INT_32
 from ..utils import _IS_32BIT, deprecated
 from ..utils.fixes import _astype_copy_false
 from ..exceptions import ChangedBehaviorWarning, NotFittedError
@@ -1150,7 +1151,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        if indptr[-1] > 2147483648:  # = 2**31 - 1
+        if indptr[-1] > MAX_INT_32:  # = 2**31 - 1
             if _IS_32BIT:
                 raise ValueError(('sparse CSR array has {} non-zero '
                                   'elements and requires 64 bit indexing, '
diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index f9bab27fca5e6..5f704b1090188 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -5,6 +5,7 @@
 
 from ..metrics import check_scoring
 from ..utils import Bunch
+from ..utils import MAX_INT_32
 from ..utils import check_random_state
 from ..utils import check_array
 from ..utils import _safe_indexing
@@ -105,7 +106,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(0, 2 ** 32)
+    random_seed = random_state.randint(MAX_INT_32)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 9e45edd6bb063..ca29f4b30f48d 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -30,6 +30,7 @@
 from ..base import is_classifier
 from ..base import MultiOutputMixin
 from ..utils import Bunch
+from ..utils import MAX_INT_32
 from ..utils import check_array
 from ..utils import check_random_state
 from ..utils.validation import _check_sample_weight
@@ -197,7 +198,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         # Check parameters
-        max_depth = ((2 ** 31) - 1 if self.max_depth is None
+        max_depth = (MAX_INT_32 if self.max_depth is None
                      else self.max_depth)
         max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                           else self.max_leaf_nodes)
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 82abff2b12183..d36b9c4ddc46d 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -56,6 +56,8 @@
 IS_PYPY = platform.python_implementation() == 'PyPy'
 _IS_32BIT = 8 * struct.calcsize("P") == 32
 
+MAX_INT_32 = np.iinfo(np.int32).max
+
 
 class Bunch(dict):
     """Container object for datasets

From 9cdc7b88b73d591de50acb53b52062576d448231 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:16:52 +0100
Subject: [PATCH 21/31] make max_int_32 inclusive

---
 sklearn/inspection/_permutation_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 5f704b1090188..d726e11430058 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -106,7 +106,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(MAX_INT_32)
+    random_seed = random_state.randint(0, MAX_INT_32)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)

From 03ab3a1016c2224c3cedc604c79323d10e6c1792 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:25:41 +0100
Subject: [PATCH 22/31] explicitly call for max int32

---
 sklearn/ensemble/_base.py                     | 3 +--
 sklearn/feature_extraction/text.py            | 3 +--
 sklearn/inspection/_permutation_importance.py | 3 +--
 sklearn/tree/_classes.py                      | 3 +--
 sklearn/utils/__init__.py                     | 2 --
 5 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py
index 2b097828f59b2..9c6d8cbce0206 100644
--- a/sklearn/ensemble/_base.py
+++ b/sklearn/ensemble/_base.py
@@ -16,7 +16,6 @@
 from ..base import BaseEstimator
 from ..base import MetaEstimatorMixin
 from ..utils import Bunch
-from ..utils import MAX_INT_32
 from ..utils import check_random_state
 from ..utils.metaestimators import _BaseComposition
 
@@ -70,7 +69,7 @@ def _set_random_states(estimator, random_state=None):
     to_set = {}
     for key in sorted(estimator.get_params(deep=True)):
         if key == 'random_state' or key.endswith('__random_state'):
-            to_set[key] = random_state.randint(MAX_INT_32)
+            to_set[key] = random_state.randint(np.iinfo(np.int32).max)
 
     if to_set:
         estimator.set_params(**to_set)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 9f9c05ad7fd90..9771c62204444 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -30,7 +30,6 @@
 from ._hash import FeatureHasher
 from ._stop_words import ENGLISH_STOP_WORDS
 from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
-from ..utils import MAX_INT_32
 from ..utils import _IS_32BIT, deprecated
 from ..utils.fixes import _astype_copy_false
 from ..exceptions import ChangedBehaviorWarning, NotFittedError
@@ -1151,7 +1150,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        if indptr[-1] > MAX_INT_32:  # = 2**31 - 1
+        if indptr[-1] > np.iinfo(np.int32).max:  # = 2**31 - 1
             if _IS_32BIT:
                 raise ValueError(('sparse CSR array has {} non-zero '
                                   'elements and requires 64 bit indexing, '
diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d726e11430058..d8d7d634340eb 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -5,7 +5,6 @@
 
 from ..metrics import check_scoring
 from ..utils import Bunch
-from ..utils import MAX_INT_32
 from ..utils import check_random_state
 from ..utils import check_array
 from ..utils import _safe_indexing
@@ -106,7 +105,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(0, MAX_INT_32)
+    random_seed = random_state.randint(0, np.iinfo(np.int32).max)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index ca29f4b30f48d..fa9944c63b5d2 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -30,7 +30,6 @@
 from ..base import is_classifier
 from ..base import MultiOutputMixin
 from ..utils import Bunch
-from ..utils import MAX_INT_32
 from ..utils import check_array
 from ..utils import check_random_state
 from ..utils.validation import _check_sample_weight
@@ -198,7 +197,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         # Check parameters
-        max_depth = (MAX_INT_32 if self.max_depth is None
+        max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                      else self.max_depth)
         max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                           else self.max_leaf_nodes)
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index d36b9c4ddc46d..82abff2b12183 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -56,8 +56,6 @@
 IS_PYPY = platform.python_implementation() == 'PyPy'
 _IS_32BIT = 8 * struct.calcsize("P") == 32
 
-MAX_INT_32 = np.iinfo(np.int32).max
-
 
 class Bunch(dict):
     """Container object for datasets

From 0c25e6151e80d1325fe6cd2f450f641914f07744 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:31:23 +0100
Subject: [PATCH 23/31] fix

---
 sklearn/inspection/_permutation_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d8d7d634340eb..923898e3e5f63 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -105,7 +105,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(0, np.iinfo(np.int32).max)
+    random_seed = random_state.randint(np.iinfo(np.int32).max + 1)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)

From fe4cac688dba13d47912603bcf8032d4631f565a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:34:34 +0100
Subject: [PATCH 24/31] revert max int32 changes

---
 sklearn/ensemble/_base.py          | 4 +++-
 sklearn/feature_extraction/text.py | 2 +-
 sklearn/tree/_classes.py           | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py
index 9c6d8cbce0206..5db30b9bbc600 100644
--- a/sklearn/ensemble/_base.py
+++ b/sklearn/ensemble/_base.py
@@ -19,6 +19,8 @@
 from ..utils import check_random_state
 from ..utils.metaestimators import _BaseComposition
 
+MAX_RAND_SEED = np.iinfo(np.int32).max
+
 
 def _parallel_fit_estimator(estimator, X, y, sample_weight=None):
     """Private function used to fit an estimator within a job."""
@@ -69,7 +71,7 @@ def _set_random_states(estimator, random_state=None):
     to_set = {}
     for key in sorted(estimator.get_params(deep=True)):
         if key == 'random_state' or key.endswith('__random_state'):
-            to_set[key] = random_state.randint(np.iinfo(np.int32).max)
+            to_set[key] = random_state.randint(MAX_RAND_SEED)
 
     if to_set:
         estimator.set_params(**to_set)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 9771c62204444..2d8f7d840c55b 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1150,7 +1150,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        if indptr[-1] > np.iinfo(np.int32).max:  # = 2**31 - 1
+        if indptr[-1] > 2147483648:  # = 2**31 - 1
             if _IS_32BIT:
                 raise ValueError(('sparse CSR array has {} non-zero '
                                   'elements and requires 64 bit indexing, '
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index fa9944c63b5d2..70c2d0f5289b8 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -197,7 +197,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         # Check parameters
-        max_depth = (np.iinfo(np.int32).max if self.max_depth is None
+        max_depth = ((2 ** 32) - 1 if self.max_depth is None
                      else self.max_depth)
         max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                           else self.max_leaf_nodes)

From 7bdb93a3cdbc61b56007e41d7f72af1b347ddf9f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:35:00 +0100
Subject: [PATCH 25/31] fix

---
 sklearn/tree/_classes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 70c2d0f5289b8..9e45edd6bb063 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -197,7 +197,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         # Check parameters
-        max_depth = ((2 ** 32) - 1 if self.max_depth is None
+        max_depth = ((2 ** 31) - 1 if self.max_depth is None
                      else self.max_depth)
         max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                           else self.max_leaf_nodes)
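Patches 17 and 20 through 25 circle around the bounds of `RandomState.randint`. A short sketch of the arithmetic they settle on (plain NumPy behavior, not scikit-learn code):

    import numpy as np

    rng = np.random.RandomState(0)

    # RandomState.randint(low, high) samples from [low, high): the upper
    # bound is exclusive, so randint(0, np.iinfo(np.int32).max) can never
    # return the int32 maximum itself...
    assert rng.randint(0, np.iinfo(np.int32).max) < np.iinfo(np.int32).max

    # ...while randint(np.iinfo(np.int32).max + 1) makes it reachable and,
    # unlike randint(0, 2 ** 32), stays within the int32 bounds that the
    # default integer dtype imposes on some platforms (e.g. Windows):
    seed = rng.randint(np.iinfo(np.int32).max + 1)
    np.random.RandomState(seed)  # every value in [0, 2**31 - 1] is a valid seed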
From d399d96fb9463193fd537a3ebda84450cf2907ad Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 14:05:34 +0100
Subject: [PATCH 26/31] Test with non-numpy-native column

---
 .../tests/test_permutation_importance.py | 32 ++++++++++++++++---
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index d29a0c1e10fcd..dfd3589129d5b 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -16,6 +16,7 @@ from sklearn.impute import SimpleImputer
 from sklearn.inspection import permutation_importance
 from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import KBinsDiscretizer
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import scale
@@ -205,13 +206,34 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs):
 
     # regression test to make sure that sequential and parallel calls will
     # output the same results.
-    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
+    X, y = make_regression(n_samples=100, n_features=5, random_state=0)
     X_df = pd.DataFrame(X)
 
-    lr = LinearRegression().fit(X, y)
-
+    # Add a categorical feature that is statistical linked to y:
+    binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
+    cat_column = binner.fit_transform(y.reshape(-1, 1))
+
+    # Concatenate the extra column to the numpy array: integer will be
+    # cast to float values
+    X = np.hstack([X, cat_column])
+    assert X.dtype.kind == "f"
+
+    # Insert extra column as a non-numpy-native dtype (while keeping backward
+    # compat for old numpy):
+    if hasattr(pd, "Categorical"):
+        cat_column = pd.Categorical(cat_column.ravel())
+    else:
+        cat_column = cat_column.ravel()
+    new_col_idx = len(X_df.columns)
+    X_df[new_col_idx] = cat_column
+    assert X_df[new_col_idx].dtype == cat_column.dtype
+
+    rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
+    rf.fit(X, y)
+
+    n_repeats = 3
     importance_array = permutation_importance(
-        lr, X, y, n_repeats=5, random_state=0, n_jobs=n_jobs
+        rf, X, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs
     )

From bdaffb5b3e4cb44ebeb1c7f970bb55a6a7c3042c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 14:57:38 +0100
Subject: [PATCH 27/31] reshuffling by position

---
 sklearn/inspection/_permutation_importance.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 923898e3e5f63..1d55a73569805 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -22,12 +22,17 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     # (memmap). X.copy() on the other hand is always guaranteed to return a
     # writable data-structure whose columns can be shuffled inplace.
     X_permuted = X.copy()
-    # Ensure to take a view on a column of X_permuted to make shuffling inplace
-    column_data = _safe_indexing(X_permuted, col_idx, axis=1)
-    column_data = getattr(column_data, "values", column_data)
 
     scores = np.zeros(n_repeats)
+    shuffling_idx = np.arange(X.shape[0])
     for n_round in range(n_repeats):
-        random_state.shuffle(column_data)
+        random_state.shuffle(shuffling_idx)
+        if hasattr(X_permuted, "iloc"):
+            # reset the index such that pandas reassigns by position
+            # instead of by index
+            X_permuted.iloc[:, col_idx] = X_permuted.iloc[
+                shuffling_idx, col_idx].reset_index(drop=True)
+        else:
+            X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
         feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score

From 42e8cb5cfb4ec1b5cb083198168a98fa0ccee988 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 15:10:05 +0100
Subject: [PATCH 28/31] remove unused import

---
 sklearn/inspection/_permutation_importance.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 1d55a73569805..c8bdb6565a95e 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -7,7 +7,6 @@
 from ..metrics import check_scoring
 from ..utils import Bunch
 from ..utils import check_random_state
 from ..utils import check_array
-from ..utils import _safe_indexing
 
 
 def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
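Patch 27's `reset_index(drop=True)` works around pandas index alignment. A sketch of the pitfall on a frame with a default `RangeIndex`, assuming the pandas behavior this PR targets, where assigning a Series through `.iloc` aligns on the index:

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    df = pd.DataFrame({"a": np.arange(5.0)})
    perm = rng.permutation(len(df))

    # Assigning a permuted Series back through iloc aligns on the index,
    # which silently puts every value back in its original row:
    naive = df.copy()
    naive.iloc[:, 0] = naive.iloc[perm, 0]
    assert naive["a"].tolist() == df["a"].tolist()      # nothing moved

    # Dropping the permuted labels forces a positional assignment, which
    # is what the reset_index(drop=True) call in the patch achieves (this
    # relies on df having a default RangeIndex):
    shuffled = df.copy()
    shuffled.iloc[:, 0] = shuffled.iloc[perm, 0].reset_index(drop=True)
    assert shuffled["a"].tolist() == df["a"].iloc[perm].tolist()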
From 1834fca5258e1fd6eb90fc775a8ce7e38d95146d Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 16:11:46 +0100
Subject: [PATCH 29/31] [ci skip] typos & better comment

---
 sklearn/inspection/tests/test_permutation_importance.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index dfd3589129d5b..fbf59f6265e13 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -209,17 +209,17 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs):
     X, y = make_regression(n_samples=100, n_features=5, random_state=0)
     X_df = pd.DataFrame(X)
 
-    # Add a categorical feature that is statistical linked to y:
+    # Add a categorical feature that is statistically linked to y:
     binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
     cat_column = binner.fit_transform(y.reshape(-1, 1))
 
-    # Concatenate the extra column to the numpy array: integer will be
+    # Concatenate the extra column to the numpy array: integers will be
     # cast to float values
     X = np.hstack([X, cat_column])
     assert X.dtype.kind == "f"
 
     # Insert extra column as a non-numpy-native dtype (while keeping backward
-    # compat for old numpy):
+    # compat for old pandas versions):
     if hasattr(pd, "Categorical"):
         cat_column = pd.Categorical(cat_column.ravel())
     else:
@@ -242,6 +242,8 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs):
     imp_max = importance_array['importances'].max()
     assert imp_max - imp_min > 0.3
 
+    # Now check that importances computed on the dataframe match the values
+    # of those computed on the array with the same data.
     importance_dataframe = permutation_importance(
         rf, X_df, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs
     )

From 5cf37f6e47ff9d7a4e3e01a09cfb25dc8f81b687 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 18:41:36 +0100
Subject: [PATCH 30/31] TST: check dataframe with a weird index

---
 sklearn/inspection/tests/test_permutation_importance.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index fbf59f6265e13..2a31a031f2938 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -228,6 +228,9 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs):
     X_df[new_col_idx] = cat_column
     assert X_df[new_col_idx].dtype == cat_column.dtype
 
+    # Stitch an arbitrary index to the dataframe:
+    X_df.index = np.arange(len(X_df)).astype(str)
+
     rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
     rf.fit(X, y)

From 51f7467252d6a513cf9405928df84fd09448cdf3 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 18:42:04 +0100
Subject: [PATCH 31/31] FIX make column permutation robust to weird indices

---
 sklearn/inspection/_permutation_importance.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index c8bdb6565a95e..80bf4d2e2a62c 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -26,10 +26,9 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     for n_round in range(n_repeats):
         random_state.shuffle(shuffling_idx)
         if hasattr(X_permuted, "iloc"):
-            # reset the index such that pandas reassigns by position
-            # instead of by index
-            X_permuted.iloc[:, col_idx] = X_permuted.iloc[
-                shuffling_idx, col_idx].reset_index(drop=True)
+            col = X_permuted.iloc[shuffling_idx, col_idx]
+            col.index = X_permuted.index
+            X_permuted.iloc[:, col_idx] = col
         else:
             X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
         feature_score = scorer(estimator, X_permuted, y)
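Patch 31 exists because the `reset_index` trick from patch 27 silently breaks once the frame has a non-default index, which is exactly the case patch 30 adds to the test. A closing sketch of the failure mode and the final fix, under the same pandas-alignment assumption as the earlier sketch:

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    df = pd.DataFrame({"a": np.arange(5.0)})
    df.index = np.arange(len(df)).astype(str)   # "weird" string index
    perm = rng.permutation(len(df))

    # Here reset_index(drop=True) would produce a RangeIndex that no longer
    # matches the frame's string labels, so the aligned assignment would
    # fill the column with NaNs instead of permuted values.

    # Re-stamping the frame's own index onto the permuted column forces a
    # purely positional assignment, as in the final version of
    # _calculate_permutation_scores:
    col = df.iloc[perm, 0]
    col.index = df.index
    df.iloc[:, 0] = col
    assert (df["a"].to_numpy() == np.arange(5.0)[perm]).all()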