From a47ebe2e479d3cb6e247b046d6a0954e77453729 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 17:13:54 +0100
Subject: [PATCH 01/31] BUG ensure that parallel/sequential provide the same results

---
 sklearn/inspection/_permutation_importance.py |  9 ++++----
 .../tests/test_permutation_importance.py      | 21 +++++++++++++++++++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d71d5fd3f3a68..015ae10af5711 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -28,6 +28,7 @@ def _safe_column_indexing(X, col_idx):
 def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
                                   n_repeats, scorer):
     """Calculate score when `col_idx` is permuted."""
+    random_state = check_random_state(random_state)
     original_feature = _safe_column_indexing(X, col_idx).copy()
     temp = original_feature.copy()
 
@@ -110,15 +111,15 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
         X = check_array(X, force_all_finite='allow-nan', dtype=np.object,
                         copy=True)
 
+    MAX_RAND_SEED = np.iinfo(np.int32).max
     random_state = check_random_state(random_state)
+    many_random_state = random_state.randint(MAX_RAND_SEED, size=X.shape[0])
     scorer = check_scoring(estimator, scoring=scoring)
-
     baseline_score = scorer(estimator, X, y)
-    scores = np.zeros((X.shape[1], n_repeats))
 
     scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores)(
-        estimator, X, y, col_idx, random_state, n_repeats, scorer
-    ) for col_idx in range(X.shape[1]))
+        estimator, X, y, col_idx, rand_int, n_repeats, scorer
+    ) for rand_int, col_idx in zip(many_random_state, range(X.shape[1])))
 
     importances = baseline_score - np.array(scores)
     return Bunch(importances_mean=np.mean(importances, axis=1),
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 671a1e11b1fec..49b574e51e4be 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -150,3 +150,24 @@ def test_permutation_importance_linear_regresssion():
                               scoring='neg_mean_squared_error')
     assert_allclose(expected_importances, results.importances_mean,
                     rtol=1e-1, atol=1e-6)
+
+
+def test_permutation_importance_equivalence_sequential_paralell():
+    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
+
+    X = scale(X)
+    y = scale(y)
+
+    lr = LinearRegression().fit(X, y)
+
+    importance_parallel = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+    )
+    importance_sequential = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=1
+    )
+
+    assert_allclose(
+        importance_parallel['importances'],
+        importance_sequential['importances']
+    )

From ab95fd53fe1668bfd04a973e286bda6813550c79 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 17:26:34 +0100
Subject: [PATCH 02/31] iter

---
 sklearn/inspection/_permutation_importance.py       | 12 +++++++++---
 .../inspection/tests/test_permutation_importance.py |  2 ++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 015ae10af5711..7e326ce61dab7 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -111,15 +111,21 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
         X = check_array(X, force_all_finite='allow-nan', dtype=np.object,
                         copy=True)
 
+    # Precompute random seed from the random state to be used
+    # to get a fresh independent RandomState instance for each
+    # parallel call to _calculate_permutation_scores, irrespective of
+    # the fact that variables are shared or not depending on the active
+    # joblib backend (sequential, thread-based or process-based).
     MAX_RAND_SEED = np.iinfo(np.int32).max
     random_state = check_random_state(random_state)
-    many_random_state = random_state.randint(MAX_RAND_SEED, size=X.shape[0])
+    random_seed = random_state.randint(0, MAX_RAND_SEED)
+
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)
 
     scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores)(
-        estimator, X, y, col_idx, rand_int, n_repeats, scorer
-    ) for rand_int, col_idx in zip(many_random_state, range(X.shape[1])))
+        estimator, X, y, col_idx, random_seed, n_repeats, scorer
+    ) for col_idx in range(X.shape[1]))
 
     importances = baseline_score - np.array(scores)
     return Bunch(importances_mean=np.mean(importances, axis=1),
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 49b574e51e4be..b67598c82e3d9 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -153,6 +153,8 @@ def test_permutation_importance_linear_regresssion():
 
 def test_permutation_importance_equivalence_sequential_paralell():
+    # regression test to make sure that sequential and parallel calls will
+    # output the same results.
     X, y = make_regression(n_samples=500, n_features=10, random_state=0)
 
     X = scale(X)
     y = scale(y)

From d58272da3f33471fa20134c9e136c06d74ef6faa Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 17:30:00 +0100
Subject: [PATCH 03/31] whats new

---
 doc/whats_new/v0.22.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 19a8327783b20..d1e18e23f70b5 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -15,6 +15,14 @@ This is a bug-fix release to primarily resolve some packaging issues in version
 Changelog
 ---------
 
+:mod:`sklearn.inspection`
+.........................
+
+- |Fix| :func:`inspection.permutation_importance` will return the same
+  `importances` when a `random_state` is given for both `n_jobs=1` or
+  `n_jobs>1`.
+  :pr:`15993` by :user:`Guillaume Lemaitre `.
+
 :mod:`sklearn.metrics`
 ......................
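The seeding scheme introduced in patch 02 can be exercised outside scikit-learn. Below is a minimal, self-contained sketch, not part of the patch series: `shuffled_first_value` is a made-up stand-in for `_calculate_permutation_scores`. It shows why drawing a plain integer seed up front and re-seeding inside each worker makes the output independent of the joblib backend, since no `RandomState` object is ever shared between workers:

    import numpy as np
    from joblib import Parallel, delayed
    from sklearn.utils import check_random_state

    def shuffled_first_value(seed, column):
        # Re-seed inside the worker: every call builds its own RandomState
        # from the integer seed, so the result cannot depend on whether the
        # joblib backend shares memory (threads) or copies it (processes).
        rng = check_random_state(seed)
        column = column.copy()
        rng.shuffle(column)
        return column[0]

    rng = check_random_state(0)
    seed = rng.randint(0, np.iinfo(np.int32).max)
    X = np.arange(50, dtype=float).reshape(10, 5)

    sequential = Parallel(n_jobs=1)(
        delayed(shuffled_first_value)(seed, X[:, j]) for j in range(X.shape[1]))
    parallel = Parallel(n_jobs=2)(
        delayed(shuffled_first_value)(seed, X[:, j]) for j in range(X.shape[1]))
    assert sequential == parallel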
From 7211a53d1c2eefc408905b2e9222e7f832dec0da Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 18:24:39 +0100
Subject: [PATCH 04/31] Check that the test is not trivial

---
 sklearn/inspection/tests/test_permutation_importance.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index b67598c82e3d9..a3bcf5c410b38 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -169,6 +169,13 @@ def test_permutation_importance_equivalence_sequential_paralell():
         lr, X, y, n_repeats=5, random_state=0, n_jobs=1
     )
 
+    # First check that the problem is structured enough and that the model is
+    # complex enough to not yield trivial, constant importances:
+    imp_min = importance_sequential['importances'].min()
+    imp_max = importance_sequential['importances'].max()
+    assert imp_max - imp_min > 0.3
+
+    # Then actually check that parallelism does not impact the results:
     assert_allclose(
         importance_parallel['importances'],
         importance_sequential['importances']
     )

From 70e1ef433ff955130d88c31a51557231772f3c9e Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 18:40:39 +0100
Subject: [PATCH 05/31] Typo in PR number

---
 doc/whats_new/v0.22.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index d1e18e23f70b5..b9e926bf8d702 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -21,7 +21,7 @@ Changelog
 - |Fix| :func:`inspection.permutation_importance` will return the same
   `importances` when a `random_state` is given for both `n_jobs=1` or
   `n_jobs>1`.
-  :pr:`15993` by :user:`Guillaume Lemaitre `.
+  :pr:`15933` by :user:`Guillaume Lemaitre `.
 
 :mod:`sklearn.metrics`
 ......................

From a6909ca6fe95af6183552975e2657b35045a7ce6 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 19:26:55 +0100
Subject: [PATCH 06/31] Fix thread-safety issue

---
 sklearn/inspection/_permutation_importance.py | 22 ++++++++---------
 .../tests/test_permutation_importance.py      | 24 +++++++++++++++----
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 7e326ce61dab7..d8475ed5ebf7a 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -29,17 +29,18 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
                                   n_repeats, scorer):
     """Calculate score when `col_idx` is permuted."""
     random_state = check_random_state(random_state)
-    original_feature = _safe_column_indexing(X, col_idx).copy()
-    temp = original_feature.copy()
+
+    # Work on a copy of X to ensure thread-safety in case of threading
+    # based parallelism:
+    X_permuted = X.copy()
+    column_data = _safe_column_indexing(X_permuted, col_idx)
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
-        random_state.shuffle(temp)
-        _safe_column_setting(X, col_idx, temp)
-        feature_score = scorer(estimator, X, y)
+        random_state.shuffle(column_data)
+        _safe_column_setting(X_permuted, col_idx, column_data)
+        feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score
-        _safe_column_setting(X, col_idx, original_feature)
 
     return scores
@@ -105,18 +106,15 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     .. [BRE] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32,
              2001. https://doi.org/10.1023/A:1010933404324
     """
-    if hasattr(X, "iloc"):
-        X = X.copy()  # Dataframe
-    else:
-        X = check_array(X, force_all_finite='allow-nan', dtype=np.object,
-                        copy=True)
+    if not hasattr(X, "iloc"):
+        X = check_array(X, force_all_finite='allow-nan', dtype=None)
 
     # Precompute random seed from the random state to be used
     # to get a fresh independent RandomState instance for each
     # parallel call to _calculate_permutation_scores, irrespective of
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
-    MAX_RAND_SEED = np.iinfo(np.int32).max
+    MAX_RAND_SEED = np.iinfo(np.uint32).max
     random_state = check_random_state(random_state)
     random_seed = random_state.randint(0, MAX_RAND_SEED)
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index a3bcf5c410b38..66a41407e6919 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -17,6 +17,8 @@ from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import scale
+from sklearn.utils import parallel_backend
+
 
 @pytest.mark.parametrize("n_jobs", [1, 2])
 def test_permutation_importance_correlated_feature_regression(n_jobs):
@@ -162,9 +164,6 @@ def test_permutation_importance_equivalence_sequential_paralell():
 
     lr = LinearRegression().fit(X, y)
 
-    importance_parallel = permutation_importance(
-        lr, X, y, n_repeats=5, random_state=0, n_jobs=2
-    )
     importance_sequential = permutation_importance(
         lr, X, y, n_repeats=5, random_state=0, n_jobs=1
     )
@@ -175,8 +174,23 @@ def test_permutation_importance_equivalence_sequential_paralell():
     imp_max = importance_sequential['importances'].max()
     assert imp_max - imp_min > 0.3
 
-    # Then actually check that parallelism does not impact the results:
+    # Then actually check that parallelism does not impact the results
+    # either with shared memory (threading) or without isolated memory
+    # via process-based parallelism using loky:
+    with parallel_backend("threading"):
+        importance_threading = permutation_importance(
+            lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+        )
+    assert_allclose(
+        importance_threading['importances'],
+        importance_sequential['importances']
+    )
+
+    with parallel_backend("loky"):
+        importance_loky = permutation_importance(
+            lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+        )
     assert_allclose(
-        importance_parallel['importances'],
+        importance_loky['importances'],
         importance_sequential['importances']
     )
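Patch 06 replaces the shuffle-then-restore pattern with a per-worker `X.copy()`. The following standalone sketch contrasts the two patterns; `permute_unsafe` and `permute_safe` are hypothetical helpers written for illustration, not scikit-learn code:

    import numpy as np

    X_shared = np.arange(12, dtype=float).reshape(6, 2)

    def permute_unsafe(col_idx, rng):
        # BAD under the threading backend: mutates the shared array, so a
        # concurrent worker may score the estimator while this column is
        # shuffled, or restore it halfway through another worker's shuffle.
        original = X_shared[:, col_idx].copy()
        rng.shuffle(X_shared[:, col_idx])
        # ... scorer(estimator, X_shared, y) would run here ...
        X_shared[:, col_idx] = original

    def permute_safe(col_idx, rng):
        # GOOD: each worker shuffles its own private copy; the shared
        # input is never written to, so no locking is needed.
        X_local = X_shared.copy()
        rng.shuffle(X_local[:, col_idx])
        # ... scorer(estimator, X_local, y) would run here ...

    permute_safe(0, np.random.RandomState(0))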
From 7ad0b26341fe6e29d12271aadb3a4856cb807d4c Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 19:37:51 +0100
Subject: [PATCH 07/31] Add non-regression test to check that issue 15810 is fixed.

---
 doc/whats_new/v0.22.rst                          |  7 +++--
 .../tests/test_permutation_importance.py         | 29 +++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index b9e926bf8d702..418680a6808cf 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -20,8 +20,11 @@ Changelog
 
 - |Fix| :func:`inspection.permutation_importance` will return the same
   `importances` when a `random_state` is given for both `n_jobs=1` or
-  `n_jobs>1`.
-  :pr:`15933` by :user:`Guillaume Lemaitre `.
+  `n_jobs>1` both with shared memory backends (thread-safety) and
+  isolated memory, process-based backends.
+  Also avoid casting the data as object dtype and avoid a read-only error
+  on large dataframes with `n_jobs>1` as reported in :issue:`15810`.
+  :pr:`15933` by :user:`Guillaume Lemaitre ` and `Olivier Grisel`_.
 
 :mod:`sklearn.metrics`
 ......................
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 66a41407e6919..bc7f2ca75c5e1 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -6,7 +6,9 @@ from sklearn.compose import ColumnTransformer
 from sklearn.datasets import load_boston
 from sklearn.datasets import load_iris
+from sklearn.datasets import make_classification
 from sklearn.datasets import make_regression
+from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LinearRegression
@@ -194,3 +196,30 @@ def test_permutation_importance_equivalence_sequential_paralell():
         importance_loky['importances'],
         importance_sequential['importances']
     )
+
+
+@pytest.mark.parametrize("input_type", ["array", "dataframe"])
+def test_permutation_importance_large_memmaped_data(input_type):
+    # Smoke, non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/15810
+    n_samples, n_features = int(5e4), 4
+    X, y = make_classification(n_samples=n_samples, n_features=n_features,
+                               random_state=0)
+    assert X.nbytes > 1e6  # trigger joblib memmapping
+
+    if input_type == "dataframe":
+        pd = pytest.importorskip("pandas")
+        X = pd.DataFrame(X)
+    else:
+        assert input_type == "array"
+
+    clf = DummyClassifier(strategy='prior').fit(X, y)
+
+    # Actual smoke test: should not raise any error:
+    n_repeats = 5
+    r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2)
+
+    # Auxiliary check: dummy classifier is feature indpendent:
+    # permuting a feature should not change the predictions
+    expected_importances = np.zeros((n_features, n_repeats))
+    assert_allclose(expected_importances, r.importances)

From 7ad0b26341fe6e29d12271aadb3a4856cb807d4c Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 19:50:27 +0100
Subject: [PATCH 08/31] Leaner test

---
 sklearn/inspection/tests/test_permutation_importance.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index bc7f2ca75c5e1..27ca369a6d138 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -160,10 +160,6 @@ def test_permutation_importance_equivalence_sequential_paralell():
     # regression test to make sure that sequential and parallel calls will
     # output the same results.
     X, y = make_regression(n_samples=500, n_features=10, random_state=0)
-
-    X = scale(X)
-    y = scale(y)
-
     lr = LinearRegression().fit(X, y)
 
     importance_sequential = permutation_importance(

From fb69870852627844056b91910e639bd6d73d72d4 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Thu, 19 Dec 2019 20:12:55 +0100
Subject: [PATCH 09/31] Support joblib 0.11

---
 .../tests/test_permutation_importance.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 27ca369a6d138..892ebc25a4a0a 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -174,22 +174,24 @@ def test_permutation_importance_equivalence_sequential_paralell():
 
     # Then actually check that parallelism does not impact the results
     # either with shared memory (threading) or without isolated memory
-    # via process-based parallelism using loky:
-    with parallel_backend("threading"):
-        importance_threading = permutation_importance(
-            lr, X, y, n_repeats=5, random_state=0, n_jobs=2
-        )
+    # via process-based parallelism using the default backend
+    # ('loky' or 'multiprocessing') depending on the joblib version:
+
+    # process-based parallelism (by default):
+    importance_processes = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=2
     assert_allclose(
-        importance_threading['importances'],
+        importance_processes['importances'],
         importance_sequential['importances']
     )
 
-    with parallel_backend("loky"):
-        importance_loky = permutation_importance(
+    # thread-based parallelism:
+    with parallel_backend("threading"):
+        importance_threading = permutation_importance(
             lr, X, y, n_repeats=5, random_state=0, n_jobs=2
         )
     assert_allclose(
-        importance_loky['importances'],
+        importance_threading['importances'],
         importance_sequential['importances']
     )
@@ -215,7 +217,7 @@ def test_permutation_importance_large_memmaped_data(input_type):
     n_repeats = 5
     r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2)
 
-    # Auxiliary check: dummy classifier is feature indpendent:
+    # Auxiliary check: DummyClassifier is feature independent:
     # permuting a feature should not change the predictions
     expected_importances = np.zeros((n_features, n_repeats))
     assert_allclose(expected_importances, r.importances)

From f80dc6946798a8f25cdc348ff61d551ef574da2b Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Thu, 19 Dec 2019 15:08:07 -0500
Subject: [PATCH 10/31] BUG Syntax error

---
 sklearn/inspection/tests/test_permutation_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 892ebc25a4a0a..2435c65121a17 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -179,7 +179,7 @@ def test_permutation_importance_equivalence_sequential_paralell():
 
     # process-based parallelism (by default):
     importance_processes = permutation_importance(
-        lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=2)
     assert_allclose(
         importance_processes['importances'],
         importance_sequential['importances']
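Patch 09 stops naming the process-based backend explicitly because joblib 0.11 predates 'loky'. A small hedged sketch of the joblib API the test relies on (assuming only that joblib is installed; `square` is a toy worker):

    from joblib import Parallel, delayed, parallel_backend

    def square(i):
        return i * i

    # Default backend is process-based: 'loky' on recent joblib,
    # 'multiprocessing' on joblib 0.11 -- hence the test no longer names it.
    print(Parallel(n_jobs=2)(delayed(square)(i) for i in range(4)))

    # The shared-memory threading backend can still be forced explicitly,
    # which is what the updated test does:
    with parallel_backend("threading"):
        print(Parallel(n_jobs=2)(delayed(square)(i) for i in range(4)))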
From 775e98698ed62045893782894788403aa6b4c1d2 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 21:40:44 +0100
Subject: [PATCH 11/31] MAX_RAND_SEED should be int32

---
 sklearn/inspection/_permutation_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d8475ed5ebf7a..e9849926b1deb 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -114,7 +114,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # parallel call to _calculate_permutation_scores, irrespective of
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
-    MAX_RAND_SEED = np.iinfo(np.uint32).max
+    MAX_RAND_SEED = np.iinfo(np.int32).max
     random_state = check_random_state(random_state)
     random_seed = random_state.randint(0, MAX_RAND_SEED)

From 063129921d4074fb97a687ebb65f8ef6542e8a73 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 22:18:59 +0100
Subject: [PATCH 12/31] cosmetic

---
 sklearn/inspection/_permutation_importance.py | 26 +++++--------------
 .../tests/test_permutation_importance.py      | 10 +++----
 2 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index e9849926b1deb..8c5c197548fc5 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -4,25 +4,10 @@
 from joblib import delayed
 
 from ..metrics import check_scoring
+from ..utils import Bunch
 from ..utils import check_random_state
 from ..utils import check_array
-from ..utils import Bunch
-
-
-def _safe_column_setting(X, col_idx, values):
-    """Set column on X using `col_idx`"""
-    if hasattr(X, "iloc"):
-        X.iloc[:, col_idx] = values
-    else:
-        X[:, col_idx] = values
-
-
-def _safe_column_indexing(X, col_idx):
-    """Return column from X using `col_idx`"""
-    if hasattr(X, "iloc"):
-        return X.iloc[:, col_idx].values
-    else:
-        return X[:, col_idx]
+from ..utils import _safe_indexing
 
 
 def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
@@ -33,11 +18,14 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     # Work on a copy of X to ensure thread-safety in case of threading
     # based parallelism:
     X_permuted = X.copy()
-    column_data = _safe_column_indexing(X_permuted, col_idx)
+    column_data = np.asarray(_safe_indexing(X_permuted, col_idx, axis=1))
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
         random_state.shuffle(column_data)
-        _safe_column_setting(X_permuted, col_idx, column_data)
+        if hasattr(X_permuted, "iloc"):
+            X_permuted.iloc[:, col_idx] = column_data
+        else:
+            X_permuted[:, col_idx] = column_data
         feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score
diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index 2435c65121a17..a27941c0a5625 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -20,6 +20,7 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import scale
 from sklearn.utils import parallel_backend
+from sklearn.utils._testing import _convert_container
 
 
 @pytest.mark.parametrize("n_jobs", [1, 2])
 def test_permutation_importance_correlated_feature_regression(n_jobs):
@@ -156,7 +157,7 @@ def test_permutation_importance_linear_regresssion():
                     rtol=1e-1, atol=1e-6)
 
 
-def test_permutation_importance_equivalence_sequential_paralell():
+def test_permutation_importance_equivalence_sequential_parallel():
     # regression test to make sure that sequential and parallel calls will
     # output the same results.
     X, y = make_regression(n_samples=500, n_features=10, random_state=0)
@@ -205,12 +206,7 @@ def test_permutation_importance_large_memmaped_data(input_type):
                                random_state=0)
     assert X.nbytes > 1e6  # trigger joblib memmapping
 
-    if input_type == "dataframe":
-        pd = pytest.importorskip("pandas")
-        X = pd.DataFrame(X)
-    else:
-        assert input_type == "array"
-
+    X = _convert_container(X, input_type)
     clf = DummyClassifier(strategy='prior').fit(X, y)
 
     # Actual smoke test: should not raise any error:

From 1a21a982a6c21e423f4d91a55c4f24a6b2be134a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 23:13:51 +0100
Subject: [PATCH 13/31] inplace operation

---
 sklearn/inspection/_permutation_importance.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 8c5c197548fc5..5054279d5e15c 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -18,14 +18,13 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     # Work on a copy of X to ensure thread-safety in case of threading
     # based parallelism:
     X_permuted = X.copy()
-    column_data = np.asarray(_safe_indexing(X_permuted, col_idx, axis=1))
+    # Ensure to take a view on a column of X_permuted to make shuffling inplace
+    column_data = _safe_indexing(X_permuted, col_idx, axis=1)
+    if hasattr(X_permuted, "iloc"):
+        column_data = column_data.values
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
         random_state.shuffle(column_data)
-        if hasattr(X_permuted, "iloc"):
-            X_permuted.iloc[:, col_idx] = column_data
-        else:
-            X_permuted[:, col_idx] = column_data
         feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score

From 023eca27472170e3c67341f01137a314839f46eb Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 23:17:27 +0100
Subject: [PATCH 14/31] cosmit

---
 sklearn/inspection/_permutation_importance.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 5054279d5e15c..82abf1960d0c2 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -20,8 +20,9 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     X_permuted = X.copy()
     # Ensure to take a view on a column of X_permuted to make shuffling inplace
     column_data = _safe_indexing(X_permuted, col_idx, axis=1)
-    if hasattr(X_permuted, "iloc"):
-        column_data = column_data.values
+    column_data = getattr(column_data, "values", column_data)
+    # if hasattr(X_permuted, "iloc"):
+    #     column_data = column_data.values
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
         random_state.shuffle(column_data)
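Patch 14 collapses the `hasattr` branch into a single `getattr` call. A minimal illustration of that duck-typing idiom (pandas is imported here only to demonstrate the Series case; this snippet is not part of the patch series):

    import numpy as np
    import pandas as pd

    col_array = np.array([1.0, 2.0, 3.0])
    col_series = pd.Series(col_array)

    # getattr(obj, "values", obj) returns the underlying ndarray of a pandas
    # Series, and the object itself for a plain ndarray -- one line instead
    # of an explicit hasattr(X, "iloc") branch:
    for col in (col_array, col_series):
        data = getattr(col, "values", col)
        assert isinstance(data, np.ndarray)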
From e21323627c03d76a25f1307ff3cb9ccd38fcc111 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 19 Dec 2019 23:18:48 +0100
Subject: [PATCH 15/31] cosmit

---
 sklearn/inspection/_permutation_importance.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 82abf1960d0c2..d959267132744 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -21,8 +21,6 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     # Ensure to take a view on a column of X_permuted to make shuffling inplace
     column_data = _safe_indexing(X_permuted, col_idx, axis=1)
     column_data = getattr(column_data, "values", column_data)
-    # if hasattr(X_permuted, "iloc"):
-    #     column_data = column_data.values
 
     scores = np.zeros(n_repeats)
     for n_round in range(n_repeats):
         random_state.shuffle(column_data)

From be8f1c11dceb550605850e7b1406b5fe5701adaf Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 01:19:28 +0100
Subject: [PATCH 16/31] Better comment explaining the need for X.copy()

---
 sklearn/inspection/_permutation_importance.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d959267132744..3c7f959577bf9 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -15,8 +15,12 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     """Calculate score when `col_idx` is permuted."""
     random_state = check_random_state(random_state)
 
-    # Work on a copy of X to ensure thread-safety in case of threading
-    # based parallelism:
+    # Work on a copy of X to ensure thread-safety in case of threading based
+    # parallelism. Furthermore, making a copy is also useful when the joblib
+    # backend is 'loky' (default) or the old 'multiprocessing': in those cases,
+    # if X is large it will automatically be backed by a readonly memory map
+    # (memmap). X.copy() on the other hand is always guaranteed to return a
+    # writable data-structure whose columns can be shuffled inplace.
     X_permuted = X.copy()
     # Ensure to take a view on a column of X_permuted to make shuffling inplace
     column_data = _safe_indexing(X_permuted, col_idx, axis=1)

From 910ef4f6bb5dbf32e8f287b6de8d6499d4498dba Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 09:29:03 +0100
Subject: [PATCH 17/31] Fix random seed range

---
 sklearn/inspection/_permutation_importance.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 3c7f959577bf9..f9bab27fca5e6 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -104,9 +104,8 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # parallel call to _calculate_permutation_scores, irrespective of
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
-    MAX_RAND_SEED = np.iinfo(np.int32).max
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(0, MAX_RAND_SEED)
+    random_seed = random_state.randint(0, 2 ** 32)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)

From 723bf03313b4c1380cf91d92c52832db1466d56b Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 09:44:34 +0100
Subject: [PATCH 18/31] Test exact equivalence in column shuffling of pandas dataframes with numpy arrays

---
 .../tests/test_permutation_importance.py | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index a27941c0a5625..d29a0c1e10fcd 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -197,6 +197,38 @@ def test_permutation_importance_equivalence_sequential_parallel():
     )
 
 
+@pytest.mark.parametrize("n_jobs", [None, 1, 2])
+def test_permutation_importance_equivalence_array_dataframe(n_jobs):
+    # This test checks that the column shuffling logic has the same behavior
+    # on both a dataframe and a simple numpy array.
+    pd = pytest.importorskip('pandas')
+
+    # regression test to make sure that sequential and parallel calls will
+    # output the same results.
+    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
+    X_df = pd.DataFrame(X)
+
+    lr = LinearRegression().fit(X, y)
+
+    importance_array = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=n_jobs
+    )
+
+    # First check that the problem is structured enough and that the model is
+    # complex enough to not yield trivial, constant importances:
+    imp_min = importance_array['importances'].min()
+    imp_max = importance_array['importances'].max()
+    assert imp_max - imp_min > 0.3
+
+    importance_dataframe = permutation_importance(
+        lr, X_df, y, n_repeats=5, random_state=0, n_jobs=n_jobs
+    )
+    assert_allclose(
+        importance_array['importances'],
+        importance_dataframe['importances']
+    )
+
+
 @pytest.mark.parametrize("input_type", ["array", "dataframe"])
 def test_permutation_importance_large_memmaped_data(input_type):
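The comment rewritten in patch 16 can be checked empirically. Below is a hedged sketch, not scikit-learn code, assuming joblib's default process-based backend and its auto-memmapping threshold (made explicit here via `max_nbytes`); `shuffle_first_column` is a made-up worker:

    import numpy as np
    from joblib import Parallel, delayed

    def shuffle_first_column(X):
        rng = np.random.RandomState(0)
        try:
            rng.shuffle(X[:, 0])       # raises on a read-only memmap
            return "wrote to the shared input"
        except ValueError:
            X_local = X.copy()         # a copy is always writable
            rng.shuffle(X_local[:, 0])
            return "shuffled a private copy"

    X = np.zeros((200000, 4))          # ~6.4 MB, above the 1 MB threshold
    print(Parallel(n_jobs=2, max_nbytes="1M")(
        delayed(shuffle_first_column)(X) for _ in range(2)))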
From f5bda8ce6c2b71b675613c1220c1f9d8daf07704 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:02:42 +0100
Subject: [PATCH 19/31] Add acknowledgment to 15898

---
 doc/whats_new/v0.22.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 418680a6808cf..69fb98a651225 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -24,6 +24,7 @@ Changelog
   isolated memory, process-based backends.
   Also avoid casting the data as object dtype and avoid a read-only error
   on large dataframes with `n_jobs>1` as reported in :issue:`15810`.
+  Follow-up of :pr:`15898` by :user:`Shivam Gargsya `.
   :pr:`15933` by :user:`Guillaume Lemaitre ` and `Olivier Grisel`_.
 
 :mod:`sklearn.metrics`

From e9770cf82387f74da543403cd52927f795b0de62 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:05:21 +0100
Subject: [PATCH 20/31] factorize max_int_32

---
 sklearn/ensemble/_base.py                     | 5 ++---
 sklearn/feature_extraction/text.py            | 3 ++-
 sklearn/inspection/_permutation_importance.py | 3 ++-
 sklearn/tree/_classes.py                      | 3 ++-
 sklearn/utils/__init__.py                     | 2 ++
 5 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py
index 5db30b9bbc600..2b097828f59b2 100644
--- a/sklearn/ensemble/_base.py
+++ b/sklearn/ensemble/_base.py
@@ -16,11 +16,10 @@
 from ..base import BaseEstimator
 from ..base import MetaEstimatorMixin
 from ..utils import Bunch
+from ..utils import MAX_INT_32
 from ..utils import check_random_state
 from ..utils.metaestimators import _BaseComposition
 
-MAX_RAND_SEED = np.iinfo(np.int32).max
-
 
 def _parallel_fit_estimator(estimator, X, y, sample_weight=None):
     """Private function used to fit an estimator within a job."""
@@ -71,7 +70,7 @@ def _set_random_states(estimator, random_state=None):
     to_set = {}
     for key in sorted(estimator.get_params(deep=True)):
         if key == 'random_state' or key.endswith('__random_state'):
-            to_set[key] = random_state.randint(MAX_RAND_SEED)
+            to_set[key] = random_state.randint(MAX_INT_32)
 
     if to_set:
         estimator.set_params(**to_set)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 2d8f7d840c55b..9f9c05ad7fd90 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -30,6 +30,7 @@
 from ._hash import FeatureHasher
 from ._stop_words import ENGLISH_STOP_WORDS
 from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
+from ..utils import MAX_INT_32
 from ..utils import _IS_32BIT, deprecated
 from ..utils.fixes import _astype_copy_false
 from ..exceptions import ChangedBehaviorWarning, NotFittedError
@@ -1150,7 +1151,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        if indptr[-1] > 2147483648:  # = 2**31 - 1
+        if indptr[-1] > MAX_INT_32:  # = 2**31 - 1
             if _IS_32BIT:
                 raise ValueError(('sparse CSR array has {} non-zero '
                                   'elements and requires 64 bit indexing, '
diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index f9bab27fca5e6..5f704b1090188 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -5,6 +5,7 @@
 
 from ..metrics import check_scoring
 from ..utils import Bunch
+from ..utils import MAX_INT_32
 from ..utils import check_random_state
 from ..utils import check_array
 from ..utils import _safe_indexing
@@ -105,7 +106,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(0, 2 ** 32)
+    random_seed = random_state.randint(MAX_INT_32)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 9e45edd6bb063..ca29f4b30f48d 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -30,6 +30,7 @@
 from ..base import is_classifier
 from ..base import MultiOutputMixin
 from ..utils import Bunch
+from ..utils import MAX_INT_32
 from ..utils import check_array
 from ..utils import check_random_state
 from ..utils.validation import _check_sample_weight
@@ -197,7 +198,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         # Check parameters
-        max_depth = ((2 ** 31) - 1 if self.max_depth is None
+        max_depth = (MAX_INT_32 if self.max_depth is None
                      else self.max_depth)
         max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                           else self.max_leaf_nodes)
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 82abff2b12183..d36b9c4ddc46d 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -56,6 +56,8 @@
 IS_PYPY = platform.python_implementation() == 'PyPy'
 _IS_32BIT = 8 * struct.calcsize("P") == 32
 
+MAX_INT_32 = np.iinfo(np.int32).max
+
 
 class Bunch(dict):
     """Container object for datasets

From 9cdc7b88b73d591de50acb53b52062576d448231 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:16:52 +0100
Subject: [PATCH 21/31] make max_int_32 inclusive

---
 sklearn/inspection/_permutation_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 5f704b1090188..d726e11430058 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -106,7 +106,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(MAX_INT_32)
+    random_seed = random_state.randint(0, MAX_INT_32)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)

From 03ab3a1016c2224c3cedc604c79323d10e6c1792 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:25:41 +0100
Subject: [PATCH 22/31] explicitly call for max int32

---
 sklearn/ensemble/_base.py                     | 3 +--
 sklearn/feature_extraction/text.py            | 3 +--
 sklearn/inspection/_permutation_importance.py | 3 +--
 sklearn/tree/_classes.py                      | 3 +--
 sklearn/utils/__init__.py                     | 2 --
 5 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py
index 2b097828f59b2..9c6d8cbce0206 100644
--- a/sklearn/ensemble/_base.py
+++ b/sklearn/ensemble/_base.py
@@ -16,7 +16,6 @@
 from ..base import BaseEstimator
 from ..base import MetaEstimatorMixin
 from ..utils import Bunch
-from ..utils import MAX_INT_32
 from ..utils import check_random_state
 from ..utils.metaestimators import _BaseComposition
 
@@ -70,7 +69,7 @@ def _set_random_states(estimator, random_state=None):
     to_set = {}
     for key in sorted(estimator.get_params(deep=True)):
         if key == 'random_state' or key.endswith('__random_state'):
-            to_set[key] = random_state.randint(MAX_INT_32)
+            to_set[key] = random_state.randint(np.iinfo(np.int32).max)
 
     if to_set:
         estimator.set_params(**to_set)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 9f9c05ad7fd90..9771c62204444 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -30,7 +30,6 @@
 from ._hash import FeatureHasher
 from ._stop_words import ENGLISH_STOP_WORDS
 from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
-from ..utils import MAX_INT_32
 from ..utils import _IS_32BIT, deprecated
 from ..utils.fixes import _astype_copy_false
 from ..exceptions import ChangedBehaviorWarning, NotFittedError
@@ -1151,7 +1150,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        if indptr[-1] > MAX_INT_32:  # = 2**31 - 1
+        if indptr[-1] > np.iinfo(np.int32).max:  # = 2**31 - 1
             if _IS_32BIT:
                 raise ValueError(('sparse CSR array has {} non-zero '
                                   'elements and requires 64 bit indexing, '
diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d726e11430058..d8d7d634340eb 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -5,7 +5,6 @@
 
 from ..metrics import check_scoring
 from ..utils import Bunch
-from ..utils import MAX_INT_32
 from ..utils import check_random_state
 from ..utils import check_array
 from ..utils import _safe_indexing
@@ -106,7 +105,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(0, MAX_INT_32)
+    random_seed = random_state.randint(0, np.iinfo(np.int32).max)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index ca29f4b30f48d..fa9944c63b5d2 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -30,7 +30,6 @@
 from ..base import is_classifier
 from ..base import MultiOutputMixin
 from ..utils import Bunch
-from ..utils import MAX_INT_32
 from ..utils import check_array
 from ..utils import check_random_state
 from ..utils.validation import _check_sample_weight
@@ -198,7 +197,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         # Check parameters
-        max_depth = (MAX_INT_32 if self.max_depth is None
+        max_depth = (np.iinfo(np.int32).max if self.max_depth is None
                      else self.max_depth)
         max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                           else self.max_leaf_nodes)
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index d36b9c4ddc46d..82abff2b12183 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -56,8 +56,6 @@
 IS_PYPY = platform.python_implementation() == 'PyPy'
 _IS_32BIT = 8 * struct.calcsize("P") == 32
 
-MAX_INT_32 = np.iinfo(np.int32).max
-
 
 class Bunch(dict):
     """Container object for datasets

From 0c25e6151e80d1325fe6cd2f450f641914f07744 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:31:23 +0100
Subject: [PATCH 23/31] fix

---
 sklearn/inspection/_permutation_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index d8d7d634340eb..923898e3e5f63 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -105,7 +105,7 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     # the fact that variables are shared or not depending on the active
     # joblib backend (sequential, thread-based or process-based).
     random_state = check_random_state(random_state)
-    random_seed = random_state.randint(0, np.iinfo(np.int32).max)
+    random_seed = random_state.randint(np.iinfo(np.int32).max + 1)
 
     scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)

From fe4cac688dba13d47912603bcf8032d4631f565a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:34:34 +0100
Subject: [PATCH 24/31] revert max int32 changes

---
 sklearn/ensemble/_base.py          | 4 +++-
 sklearn/feature_extraction/text.py | 2 +-
 sklearn/tree/_classes.py           | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py
index 9c6d8cbce0206..5db30b9bbc600 100644
--- a/sklearn/ensemble/_base.py
+++ b/sklearn/ensemble/_base.py
@@ -19,6 +19,8 @@
 from ..utils import check_random_state
 from ..utils.metaestimators import _BaseComposition
 
+MAX_RAND_SEED = np.iinfo(np.int32).max
+
 
 def _parallel_fit_estimator(estimator, X, y, sample_weight=None):
     """Private function used to fit an estimator within a job."""
@@ -69,7 +71,7 @@ def _set_random_states(estimator, random_state=None):
     to_set = {}
     for key in sorted(estimator.get_params(deep=True)):
         if key == 'random_state' or key.endswith('__random_state'):
-            to_set[key] = random_state.randint(np.iinfo(np.int32).max)
+            to_set[key] = random_state.randint(MAX_RAND_SEED)
 
     if to_set:
         estimator.set_params(**to_set)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 9771c62204444..2d8f7d840c55b 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1150,7 +1150,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        if indptr[-1] > np.iinfo(np.int32).max:  # = 2**31 - 1
+        if indptr[-1] > 2147483648:  # = 2**31 - 1
             if _IS_32BIT:
                 raise ValueError(('sparse CSR array has {} non-zero '
                                   'elements and requires 64 bit indexing, '
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index fa9944c63b5d2..70c2d0f5289b8 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -197,7 +197,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         # Check parameters
-        max_depth = (np.iinfo(np.int32).max if self.max_depth is None
+        max_depth = ((2 ** 32) - 1 if self.max_depth is None
                      else self.max_depth)
         max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                           else self.max_leaf_nodes)

From 7bdb93a3cdbc61b56007e41d7f72af1b347ddf9f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 11:35:00 +0100
Subject: [PATCH 25/31] fix

---
 sklearn/tree/_classes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 70c2d0f5289b8..9e45edd6bb063 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -197,7 +197,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         # Check parameters
-        max_depth = ((2 ** 32) - 1 if self.max_depth is None
+        max_depth = ((2 ** 31) - 1 if self.max_depth is None
                      else self.max_depth)
         max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                           else self.max_leaf_nodes)
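Patches 17 and 20 through 25 circle around the bounds of `RandomState.randint`. A short sketch of the arithmetic they settle on (plain NumPy behavior, not scikit-learn code):

    import numpy as np

    rng = np.random.RandomState(0)

    # RandomState.randint(low, high) samples from [low, high): the upper
    # bound is exclusive, so randint(0, np.iinfo(np.int32).max) can never
    # return the int32 maximum itself...
    assert rng.randint(0, np.iinfo(np.int32).max) < np.iinfo(np.int32).max

    # ...while randint(np.iinfo(np.int32).max + 1) makes it reachable and,
    # unlike randint(0, 2 ** 32), stays within the int32 bounds that the
    # default integer dtype imposes on some platforms (e.g. Windows):
    seed = rng.randint(np.iinfo(np.int32).max + 1)
    np.random.RandomState(seed)  # every value in [0, 2**31 - 1] is a valid seed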
From d399d96fb9463193fd537a3ebda84450cf2907ad Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 14:05:34 +0100
Subject: [PATCH 26/31] Test with non-numpy-native column

---
 .../tests/test_permutation_importance.py | 32 ++++++++++++++++---
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index d29a0c1e10fcd..dfd3589129d5b 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -16,6 +16,7 @@ from sklearn.impute import SimpleImputer
 from sklearn.inspection import permutation_importance
 from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import KBinsDiscretizer
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import scale
@@ -205,13 +206,34 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs):
 
     # regression test to make sure that sequential and parallel calls will
     # output the same results.
-    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
+    X, y = make_regression(n_samples=100, n_features=5, random_state=0)
     X_df = pd.DataFrame(X)
 
-    lr = LinearRegression().fit(X, y)
-
+    # Add a categorical feature that is statistical linked to y:
+    binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
+    cat_column = binner.fit_transform(y.reshape(-1, 1))
+
+    # Concatenate the extra column to the numpy array: integer will be
+    # cast to float values
+    X = np.hstack([X, cat_column])
+    assert X.dtype.kind == "f"
+
+    # Insert extra column as a non-numpy-native dtype (while keeping backward
+    # compat for old numpy):
+    if hasattr(pd, "Categorical"):
+        cat_column = pd.Categorical(cat_column.ravel())
+    else:
+        cat_column = cat_column.ravel()
+    new_col_idx = len(X_df.columns)
+    X_df[new_col_idx] = cat_column
+    assert X_df[new_col_idx].dtype == cat_column.dtype
+
+    rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
+    rf.fit(X, y)
+
+    n_repeats = 3
     importance_array = permutation_importance(
-        lr, X, y, n_repeats=5, random_state=0, n_jobs=n_jobs
+        rf, X, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs
     )

From bdaffb5b3e4cb44ebeb1c7f970bb55a6a7c3042c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 14:57:38 +0100
Subject: [PATCH 27/31] reshuffling by position

---
 sklearn/inspection/_permutation_importance.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 923898e3e5f63..1d55a73569805 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -22,12 +22,17 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     # (memmap). X.copy() on the other hand is always guaranteed to return a
     # writable data-structure whose columns can be shuffled inplace.
     X_permuted = X.copy()
-    # Ensure to take a view on a column of X_permuted to make shuffling inplace
-    column_data = _safe_indexing(X_permuted, col_idx, axis=1)
-    column_data = getattr(column_data, "values", column_data)
 
     scores = np.zeros(n_repeats)
+    shuffling_idx = np.arange(X.shape[0])
     for n_round in range(n_repeats):
-        random_state.shuffle(column_data)
+        random_state.shuffle(shuffling_idx)
+        if hasattr(X_permuted, "iloc"):
+            # reset the index such that pandas reassigns by position
+            # instead of by index
+            X_permuted.iloc[:, col_idx] = X_permuted.iloc[
+                shuffling_idx, col_idx].reset_index(drop=True)
+        else:
+            X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
         feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score

From 42e8cb5cfb4ec1b5cb083198168a98fa0ccee988 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 20 Dec 2019 15:10:05 +0100
Subject: [PATCH 28/31] remove unused import

---
 sklearn/inspection/_permutation_importance.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index 1d55a73569805..c8bdb6565a95e 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -7,7 +7,6 @@
 from ..metrics import check_scoring
 from ..utils import Bunch
 from ..utils import check_random_state
 from ..utils import check_array
-from ..utils import _safe_indexing
 
 
 def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
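Patch 27's `reset_index(drop=True)` works around pandas index alignment. A sketch of the pitfall on a frame with a default `RangeIndex`, assuming the pandas behavior this PR targets, where assigning a Series through `.iloc` aligns on the index:

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    df = pd.DataFrame({"a": np.arange(5.0)})
    perm = rng.permutation(len(df))

    # Assigning a permuted Series back through iloc aligns on the index,
    # which silently puts every value back in its original row:
    naive = df.copy()
    naive.iloc[:, 0] = naive.iloc[perm, 0]
    assert naive["a"].tolist() == df["a"].tolist()      # nothing moved

    # Dropping the permuted labels forces a positional assignment, which
    # is what the reset_index(drop=True) call in the patch achieves (this
    # relies on df having a default RangeIndex):
    shuffled = df.copy()
    shuffled.iloc[:, 0] = shuffled.iloc[perm, 0].reset_index(drop=True)
    assert shuffled["a"].tolist() == df["a"].iloc[perm].tolist()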
From 1834fca5258e1fd6eb90fc775a8ce7e38d95146d Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 16:11:46 +0100
Subject: [PATCH 29/31] [ci skip] typos & better comment

---
 sklearn/inspection/tests/test_permutation_importance.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index dfd3589129d5b..fbf59f6265e13 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -209,17 +209,17 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs):
     X, y = make_regression(n_samples=100, n_features=5, random_state=0)
     X_df = pd.DataFrame(X)
 
-    # Add a categorical feature that is statistical linked to y:
+    # Add a categorical feature that is statistically linked to y:
     binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
     cat_column = binner.fit_transform(y.reshape(-1, 1))
 
-    # Concatenate the extra column to the numpy array: integer will be
+    # Concatenate the extra column to the numpy array: integers will be
     # cast to float values
     X = np.hstack([X, cat_column])
     assert X.dtype.kind == "f"
 
     # Insert extra column as a non-numpy-native dtype (while keeping backward
-    # compat for old numpy):
+    # compat for old pandas versions):
     if hasattr(pd, "Categorical"):
         cat_column = pd.Categorical(cat_column.ravel())
     else:
@@ -242,6 +242,8 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs):
     imp_max = importance_array['importances'].max()
     assert imp_max - imp_min > 0.3
 
+    # Now check that importances computed on the dataframe match the values
+    # of those computed on the array with the same data.
     importance_dataframe = permutation_importance(
         rf, X_df, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs
     )

From 5cf37f6e47ff9d7a4e3e01a09cfb25dc8f81b687 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 18:41:36 +0100
Subject: [PATCH 30/31] TST: check dataframe with a weird index

---
 sklearn/inspection/tests/test_permutation_importance.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py
index fbf59f6265e13..2a31a031f2938 100644
--- a/sklearn/inspection/tests/test_permutation_importance.py
+++ b/sklearn/inspection/tests/test_permutation_importance.py
@@ -228,6 +228,9 @@ def test_permutation_importance_equivalence_array_dataframe(n_jobs):
     X_df[new_col_idx] = cat_column
     assert X_df[new_col_idx].dtype == cat_column.dtype
 
+    # Stitch an arbitrary index to the dataframe:
+    X_df.index = np.arange(len(X_df)).astype(str)
+
     rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
     rf.fit(X, y)

From 51f7467252d6a513cf9405928df84fd09448cdf3 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 20 Dec 2019 18:42:04 +0100
Subject: [PATCH 31/31] FIX make column permutation robust to weird indices

---
 sklearn/inspection/_permutation_importance.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py
index c8bdb6565a95e..80bf4d2e2a62c 100644
--- a/sklearn/inspection/_permutation_importance.py
+++ b/sklearn/inspection/_permutation_importance.py
@@ -26,10 +26,9 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
     for n_round in range(n_repeats):
         random_state.shuffle(shuffling_idx)
         if hasattr(X_permuted, "iloc"):
-            # reset the index such that pandas reassigns by position
-            # instead of by index
-            X_permuted.iloc[:, col_idx] = X_permuted.iloc[
-                shuffling_idx, col_idx].reset_index(drop=True)
+            col = X_permuted.iloc[shuffling_idx, col_idx]
+            col.index = X_permuted.index
+            X_permuted.iloc[:, col_idx] = col
         else:
             X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
         feature_score = scorer(estimator, X_permuted, y)
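Patch 31 exists because the `reset_index` trick from patch 27 silently breaks once the frame has a non-default index, which is exactly the case patch 30 adds to the test. A closing sketch of the failure mode and the final fix, under the same pandas-alignment assumption as the earlier sketch:

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    df = pd.DataFrame({"a": np.arange(5.0)})
    df.index = np.arange(len(df)).astype(str)   # "weird" string index
    perm = rng.permutation(len(df))

    # Here reset_index(drop=True) would produce a RangeIndex that no longer
    # matches the frame's string labels, so the aligned assignment would
    # fill the column with NaNs instead of permuted values.

    # Re-stamping the frame's own index onto the permuted column forces a
    # purely positional assignment, as in the final version of
    # _calculate_permutation_scores:
    col = df.iloc[perm, 0]
    col.index = df.index
    df.iloc[:, 0] = col
    assert (df["a"].to_numpy() == np.arange(5.0)[perm]).all()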