From ca334af087e3ab50a4227b208b145a49dfbce233 Mon Sep 17 00:00:00 2001 From: MDouriez Date: Sat, 2 Nov 2019 15:57:59 -0700 Subject: [PATCH 01/11] documentation for random_state in forests --- sklearn/ensemble/_forest.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index a1c6d1b255523..ec93e66aab69e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -960,10 +960,11 @@ class RandomForestClassifier(ForestClassifier): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls both the randomness of the bootstrapping of the samples used + when building trees (if `bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary ` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. From 70d63d16370d83e6aa70b1d7f89394947c632f36 Mon Sep 17 00:00:00 2001 From: MDouriez Date: Sat, 2 Nov 2019 16:01:49 -0700 Subject: [PATCH 02/11] move note to parameter --- sklearn/ensemble/_forest.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index ec93e66aab69e..677f485fdfdc4 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -964,6 +964,12 @@ class RandomForestClassifier(ForestClassifier): when building trees (if `bootstrap=True``) and the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``). + The features are always randomly permuted at each split. 
Therefore, + the best found split may vary, even with the same training data, + ``max_features=n_features`` and ``bootstrap=False``, if the improvement + of the criterion is identical for several splits enumerated during the + search of the best split. To obtain a deterministic behaviour during + fitting, ``random_state`` has to be fixed. See :term:`Glossary ` for details. verbose : int, optional (default=0) @@ -1080,13 +1086,6 @@ class labels (multi-output problem). reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. - The features are always randomly permuted at each split. Therefore, - the best found split may vary, even with the same training data, - ``max_features=n_features`` and ``bootstrap=False``, if the improvement - of the criterion is identical for several splits enumerated during the - search of the best split. To obtain a deterministic behaviour during - fitting, ``random_state`` has to be fixed. - References ---------- From 8ff9ed872e5172af9ee2a39da34cb2e80498efee Mon Sep 17 00:00:00 2001 From: MDouriez Date: Sat, 2 Nov 2019 16:13:39 -0700 Subject: [PATCH 03/11] same for RandomForestRegressor --- sklearn/ensemble/_forest.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 677f485fdfdc4..5f12e58bb8acb 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -961,15 +961,15 @@ class RandomForestClassifier(ForestClassifier): random_state : int, RandomState instance or None, optional (default=None) Controls both the randomness of the bootstrapping of the samples used - when building trees (if `bootstrap=True``) and the sampling of the + when building trees (if ``bootstrap=True``) and the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``). 
- The features are always randomly permuted at each split. Therefore, - the best found split may vary, even with the same training data, - ``max_features=n_features`` and ``bootstrap=False``, if the improvement - of the criterion is identical for several splits enumerated during the - search of the best split. To obtain a deterministic behaviour during - fitting, ``random_state`` has to be fixed. + Also note that the features are always randomly permuted at each split. + Therefore, the best found split may vary, even with the same training + data, ``max_features=n_features`` and ``bootstrap=False``, if the + improvement of the criterion is identical for several splits enumerated + during the search of the best split. To obtain a deterministic + behaviour during fitting, ``random_state`` has to be fixed. See :term:`Glossary ` for details. verbose : int, optional (default=0) @@ -1276,10 +1276,17 @@ class RandomForestRegressor(ForestRegressor): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + Also note that the features are always randomly permuted at each split. + Therefore, the best found split may vary, even with the same training + data, ``max_features=n_features`` and ``bootstrap=False``, if the + improvement of the criterion is identical for several splits enumerated + during the search of the best split. To obtain a deterministic + behaviour during fitting, ``random_state`` has to be fixed. + See :term:`Glossary ` for details. 
verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. @@ -1357,13 +1364,6 @@ class RandomForestRegressor(ForestRegressor): reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. - The features are always randomly permuted at each split. Therefore, - the best found split may vary, even with the same training data, - ``max_features=n_features`` and ``bootstrap=False``, if the improvement - of the criterion is identical for several splits enumerated during the - search of the best split. To obtain a deterministic behaviour during - fitting, ``random_state`` has to be fixed. - The default value ``max_features="auto"`` uses ``n_features`` rather than ``n_features / 3``. The latter was originally suggested in [1], whereas the former was more recently justified empirically in [2]. From 48ed17a8084b5275e479788270722b9b587f67e5 Mon Sep 17 00:00:00 2001 From: MDouriez Date: Sat, 2 Nov 2019 16:41:09 -0700 Subject: [PATCH 04/11] add doc for ExtraTreesRegressor and ExtraTreesClassifier --- sklearn/ensemble/_forest.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 5f12e58bb8acb..899511dba6225 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1538,7 +1538,7 @@ class ExtraTreesClassifier(ForestClassifier): bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. If False, the - whole datset is used to build each tree. + whole dataset is used to build each tree. oob_score : bool, optional (default=False) Whether to use out-of-bag samples to estimate @@ -1552,10 +1552,13 @@ class ExtraTreesClassifier(ForestClassifier): ` for more details. 
random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls 3 sources of randomness: + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + See :term:`Glossary ` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. @@ -1842,7 +1845,7 @@ class ExtraTreesRegressor(ForestRegressor): bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. If False, the - whole datset is used to build each tree. + whole dataset is used to build each tree. oob_score : bool, optional (default=False) Whether to use out-of-bag samples to estimate the R^2 on unseen data. @@ -1855,10 +1858,13 @@ class ExtraTreesRegressor(ForestRegressor): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls 3 sources of randomness: + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + See :term:`Glossary ` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. 
From 07204b3ada5f4dea607b69e01d81201fb8ccd204 Mon Sep 17 00:00:00 2001 From: Marie Douriez Date: Mon, 18 Nov 2019 09:18:00 -0800 Subject: [PATCH 05/11] skip line --- sklearn/ensemble/_forest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 899511dba6225..e2ee3b52e17f7 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1553,6 +1553,7 @@ class ExtraTreesClassifier(ForestClassifier): random_state : int, RandomState instance or None, optional (default=None) Controls 3 sources of randomness: + - the bootstrapping of the samples used when building trees (if ``bootstrap=True``) - the sampling of the features to consider when looking for the best From 07ae312271879b9012e8264907916d7888a9c7e1 Mon Sep 17 00:00:00 2001 From: Marie Douriez Date: Mon, 18 Nov 2019 13:53:06 -0800 Subject: [PATCH 06/11] lint --- sklearn/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index e2ee3b52e17f7..63ef66387200a 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1553,7 +1553,7 @@ class ExtraTreesClassifier(ForestClassifier): random_state : int, RandomState instance or None, optional (default=None) Controls 3 sources of randomness: - + - the bootstrapping of the samples used when building trees (if ``bootstrap=True``) - the sampling of the features to consider when looking for the best From 3d810f93139b90fb374c492eefeeb63fc9fed5d0 Mon Sep 17 00:00:00 2001 From: MDouriez Date: Mon, 18 Nov 2019 19:34:05 -0800 Subject: [PATCH 07/11] move note back to where it was --- sklearn/ensemble/_forest.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 63ef66387200a..24c00cd896473 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -964,12 +964,6 @@ class 
RandomForestClassifier(ForestClassifier): when building trees (if ``bootstrap=True``) and the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``). - Also note that the features are always randomly permuted at each split. - Therefore, the best found split may vary, even with the same training - data, ``max_features=n_features`` and ``bootstrap=False``, if the - improvement of the criterion is identical for several splits enumerated - during the search of the best split. To obtain a deterministic - behaviour during fitting, ``random_state`` has to be fixed. See :term:`Glossary ` for details. verbose : int, optional (default=0) @@ -1086,6 +1080,13 @@ class labels (multi-output problem). reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data, + ``max_features=n_features`` and ``bootstrap=False``, if the improvement + of the criterion is identical for several splits enumerated during the + search of the best split. To obtain a deterministic behaviour during + fitting, ``random_state`` has to be fixed. + References ---------- @@ -1280,13 +1281,6 @@ class RandomForestRegressor(ForestRegressor): when building trees (if ``bootstrap=True``) and the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``). - Also note that the features are always randomly permuted at each split. - Therefore, the best found split may vary, even with the same training - data, ``max_features=n_features`` and ``bootstrap=False``, if the - improvement of the criterion is identical for several splits enumerated - during the search of the best split. To obtain a deterministic - behaviour during fitting, ``random_state`` has to be fixed. - See :term:`Glossary ` for details. 
verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. @@ -1364,6 +1358,13 @@ class RandomForestRegressor(ForestRegressor): reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data, + ``max_features=n_features`` and ``bootstrap=False``, if the improvement + of the criterion is identical for several splits enumerated during the + search of the best split. To obtain a deterministic behaviour during + fitting, ``random_state`` has to be fixed. + The default value ``max_features="auto"`` uses ``n_features`` rather than ``n_features / 3``. The latter was originally suggested in [1], whereas the former was more recently justified empirically in [2]. From 27e5b62b5740ea036513bed6f919d26f48ed4ae9 Mon Sep 17 00:00:00 2001 From: MDouriez Date: Mon, 18 Nov 2019 19:39:11 -0800 Subject: [PATCH 08/11] add Glossary in RandomForestRegressor --- sklearn/ensemble/_forest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 24c00cd896473..37634096418a4 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1281,6 +1281,7 @@ class RandomForestRegressor(ForestRegressor): when building trees (if ``bootstrap=True``) and the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``). + See :term:`Glossary <random_state>` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. 
From c8fd2354f058ad2fdc68456041d3215c311b6038 Mon Sep 17 00:00:00 2001 From: MDouriez Date: Mon, 18 Nov 2019 19:50:01 -0800 Subject: [PATCH 09/11] adding description for RandomTreesEmbedding --- sklearn/ensemble/_forest.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 37634096418a4..ab475758e80fa 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1862,6 +1862,7 @@ class ExtraTreesRegressor(ForestRegressor): random_state : int, RandomState instance or None, optional (default=None) Controls 3 sources of randomness: + - the bootstrapping of the samples used when building trees (if ``bootstrap=True``) - the sampling of the features to consider when looking for the best @@ -2094,10 +2095,14 @@ class RandomTreesEmbedding(BaseForest): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls 3 sources of randomness: + + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + See :term:`Glossary <random_state>` for details. verbose : int, optional (default=0) Controls the verbosity when fitting and predicting. 
From bcdd5de7c821789fe450dda032d14bf95f444448 Mon Sep 17 00:00:00 2001 From: MDouriez Date: Mon, 18 Nov 2019 20:24:33 -0800 Subject: [PATCH 10/11] small fix --- sklearn/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 20c449d363709..f07baf6dc596a 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2102,7 +2102,7 @@ class RandomTreesEmbedding(BaseForest): - the bootstrapping of the samples used when building trees (if ``bootstrap=True``) - - the sampling of the features to consider when looking for the best + - the sampling of the features to consider when looking for a split at each node (if ``max_features < n_features``) - the draw of the splits for each of the `max_features` See :term:`Glossary <random_state>` for details. From 5c8c0982b509fb1c4d630288da8279739179d90b Mon Sep 17 00:00:00 2001 From: MDouriez Date: Mon, 18 Nov 2019 21:47:29 -0800 Subject: [PATCH 11/11] correct description for RandomTreesEmbedding --- sklearn/ensemble/_forest.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f07baf6dc596a..e7a0b8e56dde8 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2098,13 +2098,8 @@ class RandomTreesEmbedding(BaseForest): ` for more details. random_state : int, RandomState instance or None, optional (default=None) - Controls 3 sources of randomness: - - - the bootstrapping of the samples used when building trees - (if ``bootstrap=True``) - - the sampling of the features to consider when looking for a - split at each node (if ``max_features < n_features``) - - the draw of the splits for each of the `max_features` + Controls the generation of the random `y` used to fit the trees + and the draw of the splits for each feature at the trees' nodes. See :term:`Glossary <random_state>` for details. verbose : int, optional (default=0)