From de35beab435777bc033da42102ec2c1459235104 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Thu, 11 Dec 2014 16:22:26 -0800 Subject: [PATCH 1/8] add support for class_weights --- sklearn/ensemble/forest.py | 79 ++++++++++++++++++++++----- sklearn/ensemble/tests/test_forest.py | 47 ++++++++++++++++ sklearn/utils/estimator_checks.py | 2 + 3 files changed, 114 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index c370f6bc87340..e6acbb81fe11d 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -58,7 +58,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, check_array +from ..utils import check_random_state, check_array, compute_class_weight from ..utils.validation import DataConversionWarning from .base import BaseEnsemble, _partition_estimators @@ -122,7 +122,8 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + class_weight=None): super(BaseForest, self).__init__( base_estimator=base_estimator, n_estimators=n_estimators, @@ -134,6 +135,7 @@ def __init__(self, self.random_state = random_state self.verbose = verbose self.warm_start = warm_start + self.class_weight = class_weight def apply(self, X): """Apply trees in the forest to X, return leaf indices. @@ -211,11 +213,17 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] - y = self._validate_y(y) + y, cw = self._validate_y_cw(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) + if cw is not None: + if sample_weight is not None: + sample_weight *= cw + else: + sample_weight = cw + # Check parameters self._validate_estimator() @@ -279,9 +287,9 @@ def fit(self, X, y, sample_weight=None): def _set_oob_score(self, X, y): """Calculate out of bag predictions and score.""" - def _validate_y(self, y): + def _validate_y_cw(self, y): # Default implementation - return y + return y, None @property def feature_importances_(self): @@ -320,7 +328,8 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + class_weight=None): super(ForestClassifier, self).__init__( base_estimator, @@ -331,7 +340,8 @@ def __init__(self, n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start) + warm_start=warm_start, + class_weight=class_weight) def _set_oob_score(self, X, y): """Compute out-of-bag score""" @@ -377,8 +387,9 @@ def _set_oob_score(self, X, y): self.oob_score_ = oob_score / self.n_outputs_ - def _validate_y(self, y): - y = np.copy(y) + def _validate_y_cw(self, y_org): + y = np.copy(y_org) + cw = None self.classes_ = [] self.n_classes_ = [] @@ -388,7 +399,19 @@ def _validate_y(self, y): self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) - return y + if self.class_weight is not None: + if self.n_outputs_ == 1: + cw = compute_class_weight(self.class_weight, + self.classes_[0], + y_org[:, 0]) + cw = cw[np.searchsorted(self.classes_[0], y_org[:, 0])] + else: + raise NotImplementedError('class_weights are not supported ' + 'for multi-output. You may use ' + 'sample_weights in the fit method ' + 'to weight by sample.') + + return y, cw def predict(self, X): """Predict class for X. 
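The core mechanism of this first patch is the expansion in `_validate_y_cw`: `compute_class_weight` returns one weight per class, and indexing that array through `np.searchsorted(self.classes_[0], y_org[:, 0])` broadcasts it to one weight per sample, which `fit` then multiplies into `sample_weight`. A minimal standalone sketch of that expansion, with toy labels and illustrative names; note it assumes the scikit-learn API of this era (the 'auto' preset was later renamed 'balanced', and the arguments of `compute_class_weight` later became keyword-only):

    import numpy as np
    from sklearn.utils import compute_class_weight

    y = np.array([0, 0, 0, 0, 1, 1, 2])   # imbalanced toy labels
    classes = np.unique(y)                 # sorted unique class labels

    # one weight per class, inversely proportional to class frequency
    cw = compute_class_weight('auto', classes, y)

    # np.unique returns sorted labels, so searchsorted recovers each
    # sample's class index; indexing cw with it gives per-sample weights
    sample_weight = cw[np.searchsorted(classes, y)]
    print(sample_weight)   # the rarer classes receive the larger weights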
@@ -707,6 +730,18 @@ class RandomForestClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. + class_weight : dict, {class_label: weight} or "auto" or None, optional + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "auto" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies. + + Note that this is only supported for single-output classification. + + Note that these weights will be multiplied with class_weight (passed + through the fit method) if sample_weight is specified + Attributes ---------- estimators_ : list of DecisionTreeClassifier @@ -755,7 +790,8 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + class_weight=None): super(RandomForestClassifier, self).__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, @@ -768,7 +804,8 @@ def __init__(self, n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start) + warm_start=warm_start, + class_weight=class_weight) self.criterion = criterion self.max_depth = max_depth @@ -1017,6 +1054,18 @@ class ExtraTreesClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. + class_weight : dict, {class_label: weight} or "auto" or None, optional + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "auto" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies. + + Note that this is only supported for single-output classification. + + Note that these weights will be multiplied with class_weight (passed + through the fit method) if sample_weight is specified + Attributes ---------- estimators_ : list of DecisionTreeClassifier @@ -1068,7 +1117,8 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + class_weight=None): super(ExtraTreesClassifier, self).__init__( base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, @@ -1080,7 +1130,8 @@ def __init__(self, n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start) + warm_start=warm_start, + class_weight=class_weight) self.criterion = criterion self.max_depth = max_depth diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index b37d760f7eaf4..5cccc48760724 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -747,6 +747,53 @@ def test_1d_input(): yield check_1d_input, name, X, X_2d, y +def check_class_weights(name): + """Check class_weights resemble sample_weights behavior.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + # Iris is balanced, so no effect expected for using 'auto' weights + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target) + clf2 = ForestClassifier(class_weight='auto', random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1., 1: 100., 2: 1.} + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight) + clf2 = ForestClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target) + 
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Check that sample_weight and class_weight are multiplicative + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight**2) + clf2 = ForestClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + +def test_class_weights(): + for name in FOREST_CLASSIFIERS: + yield check_class_weights, name + + +def check_class_weight_failure_multi_output(name): + """Test class_weight failure for multi-output""" + ForestClassifier = FOREST_CLASSIFIERS[name] + _y = np.vstack((y, np.array(y) * 2)).T + clf = ForestClassifier(class_weight='auto') + assert_raises(NotImplementedError, clf.fit, X, _y) + + +def test_class_weight_failure_multi_output(): + for name in FOREST_CLASSIFIERS: + yield check_class_weight_failure_multi_output, name + + def check_warm_start(name, random_state=42): """Test if fitting incrementally with warm start gives a forest of the right size and the same results as a normal fit.""" diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6dd457685e2a0..3a3c55aaed95d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -737,6 +737,8 @@ def check_class_weight_classifiers(name, Classifier): classifier = Classifier(class_weight=class_weight) if hasattr(classifier, "n_iter"): classifier.set_params(n_iter=100) + if hasattr(classifier, "min_weight_fraction_leaf"): + classifier.set_params(min_weight_fraction_leaf=0.01) set_random_state(classifier) classifier.fit(X_train, y_train) From b131ad23fab1416fed68931be2d1380448c23b20 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sun, 14 Dec 2014 13:31:31 -0800 Subject: [PATCH 2/8] add multioutput support & bootstrap auto mode --- sklearn/ensemble/forest.py | 96 +++++++++++++++++++-------- sklearn/ensemble/tests/test_forest.py | 17 +++-- 2 files changed, 78 insertions(+), 35 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index e6acbb81fe11d..1b346d857f7ed 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -41,8 +41,6 @@ class calls the ``fit`` method of each sub-estimator on random samples from __future__ import division -import numpy as np - from warnings import warn from abc import ABCMeta, abstractmethod @@ -72,7 +70,7 @@ class calls the ``fit`` method of each sub-estimator on random samples def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, - verbose=0): + verbose=0, class_weight=None): """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) @@ -89,6 +87,27 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, sample_counts = np.bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts + if class_weight == 'bootstrap': + cw = [curr_sample_weight] + for k in range(y.shape[1]): + y_full = y[:, k] + classes_full = np.unique(y_full) + y_boot = y_full[indices] + classes_boot = np.unique(y_boot) + # Get class weights for the bootstrap sample + cw_part = compute_class_weight('auto', classes_boot, y_boot) + # Expand class weights to cover all classes in original y + # (in case some were missing from the bootstrap sample) + cw_part = np.array([cw_part[np.argwhere(classes_boot == w)] + if w in classes_boot + else 0. 
+ for w in classes_full], dtype=np.float64) + # Expand weights over the original y for this output + cw_part = cw_part[np.searchsorted(classes_full, y_full)] + cw.append(cw_part) + # Multiply all weights by sample & bootstrap weights + curr_sample_weight = np.prod(cw, axis=0, dtype=np.float64) + tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) tree.indices_ = sample_counts > 0. @@ -267,7 +286,7 @@ def fit(self, X, y, sample_weight=None): backend="threading")( delayed(_parallel_build_trees)( t, self, X, y, sample_weight, i, len(trees), - verbose=self.verbose) + verbose=self.verbose, class_weight=self.class_weight) for i, t in enumerate(trees)) # Collect newly grown trees @@ -399,17 +418,26 @@ def _validate_y_cw(self, y_org): self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) - if self.class_weight is not None: - if self.n_outputs_ == 1: - cw = compute_class_weight(self.class_weight, - self.classes_[0], - y_org[:, 0]) - cw = cw[np.searchsorted(self.classes_[0], y_org[:, 0])] - else: - raise NotImplementedError('class_weights are not supported ' - 'for multi-output. You may use ' - 'sample_weights in the fit method ' - 'to weight by sample.') + if (self.class_weight is not None and + (self.class_weight != 'bootstrap' or not self.bootstrap)): + cw = [] + for k in range(self.n_outputs_): + if self.class_weight in ['auto', 'bootstrap']: + cw_part = compute_class_weight('auto', + self.classes_[k], + y_org[:, k]) + elif self.n_outputs_ == 1: + cw_part = compute_class_weight(self.class_weight, + self.classes_[k], + y_org[:, k]) + else: + cw_part = compute_class_weight(self.class_weight[k], + self.classes_[k], + y_org[:, k]) + cw_part = cw_part[np.searchsorted(self.classes_[k], + y_org[:, k])] + cw.append(cw_part) + cw = np.prod(cw, axis=0, dtype=np.float64) return y, cw @@ -730,17 +758,22 @@ class RandomForestClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. - class_weight : dict, {class_label: weight} or "auto" or None, optional - Weights associated with classes. If not given, all classes - are supposed to have weight one. + class_weight : dict, list of dicts, "auto", "bootstrap" or None, optional + + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. For multi-output, the weights of each + column of y will be multiplied together. The "auto" mode uses the values of y to automatically adjust - weights inversely proportional to class frequencies. + weights inversely proportional to class frequencies in the input data. - Note that this is only supported for single-output classification. + The "bootstrap" mode is the same as "auto" except that weights are + computed based on the bootstrap sample for every tree grown. - Note that these weights will be multiplied with class_weight (passed - through the fit method) if sample_weight is specified + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. Attributes ---------- @@ -1054,17 +1087,22 @@ class ExtraTreesClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. - class_weight : dict, {class_label: weight} or "auto" or None, optional - Weights associated with classes. If not given, all classes - are supposed to have weight one. 
+ class_weight : dict, list of dicts, "auto", "bootstrap" or None, optional + + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. For multi-output, the weights of each + column of y will be multiplied together. The "auto" mode uses the values of y to automatically adjust - weights inversely proportional to class frequencies. + weights inversely proportional to class frequencies in the input data. - Note that this is only supported for single-output classification. + The "bootstrap" mode is the same as "auto" except that weights are + computed based on the bootstrap sample for every tree grown. - Note that these weights will be multiplied with class_weight (passed - through the fit method) if sample_weight is specified + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. Attributes ---------- diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 5cccc48760724..7407d272ecca8 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -781,17 +781,22 @@ def test_class_weights(): yield check_class_weights, name -def check_class_weight_failure_multi_output(name): - """Test class_weight failure for multi-output""" +def check_class_weight_auto_and_bootstrap_multi_output(name): + """Test class_weight works for multi-output""" ForestClassifier = FOREST_CLASSIFIERS[name] _y = np.vstack((y, np.array(y) * 2)).T - clf = ForestClassifier(class_weight='auto') - assert_raises(NotImplementedError, clf.fit, X, _y) + clf = ForestClassifier(class_weight='auto', random_state=0) + clf.fit(X, _y) + clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}, {-2: 1., 2: 1.}], + random_state=0) + clf.fit(X, _y) + clf = ForestClassifier(class_weight='bootstrap', random_state=0) + clf.fit(X, _y) -def test_class_weight_failure_multi_output(): +def test_class_weight_auto_and_bootstrap_multi_output(): for name in FOREST_CLASSIFIERS: - yield check_class_weight_failure_multi_output, name + yield check_class_weight_auto_and_bootstrap_multi_output, name def check_warm_start(name, random_state=42): From 4c77ae897bf4b3bbbe499d8450acf55d14245f6a Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sun, 14 Dec 2014 15:08:13 -0800 Subject: [PATCH 3/8] expanded class_weight dimension fix --- sklearn/ensemble/forest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 1b346d857f7ed..01286b4692efc 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -98,10 +98,10 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, cw_part = compute_class_weight('auto', classes_boot, y_boot) # Expand class weights to cover all classes in original y # (in case some were missing from the bootstrap sample) - cw_part = np.array([cw_part[np.argwhere(classes_boot == w)] + cw_part = np.array([cw_part[np.where(classes_boot == w)][0] if w in classes_boot else 0. 
- for w in classes_full], dtype=np.float64) + for w in classes_full]) # Expand weights over the original y for this output cw_part = cw_part[np.searchsorted(classes_full, y_full)] cw.append(cw_part) From 085d677ea2866e3e60d4b6323dfc8c4c9e841cb6 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 22 Dec 2014 16:50:58 -0800 Subject: [PATCH 4/8] add class_weight to trees, expand tests & minor refactor --- sklearn/ensemble/forest.py | 25 ++++++------ sklearn/ensemble/tests/test_forest.py | 16 +++++++- sklearn/tree/tests/test_tree.py | 55 +++++++++++++++++++++++++-- sklearn/tree/tree.py | 54 ++++++++++++++++++++++++-- 4 files changed, 127 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 01286b4692efc..23afed13e4c5b 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -423,17 +423,14 @@ def _validate_y_cw(self, y_org): cw = [] for k in range(self.n_outputs_): if self.class_weight in ['auto', 'bootstrap']: - cw_part = compute_class_weight('auto', - self.classes_[k], - y_org[:, k]) + class_weight_k = 'auto' elif self.n_outputs_ == 1: - cw_part = compute_class_weight(self.class_weight, - self.classes_[k], - y_org[:, k]) + class_weight_k = self.class_weight else: - cw_part = compute_class_weight(self.class_weight[k], - self.classes_[k], - y_org[:, k]) + class_weight_k = self.class_weight[k] + cw_part = compute_class_weight(class_weight_k, + self.classes_[k], + y_org[:, k]) cw_part = cw_part[np.searchsorted(self.classes_[k], y_org[:, k])] cw.append(cw_part) @@ -763,8 +760,7 @@ class RandomForestClassifier(ForestClassifier): Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same - order as the columns of y. For multi-output, the weights of each - column of y will be multiplied together. + order as the columns of y. The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. @@ -772,6 +768,8 @@ class RandomForestClassifier(ForestClassifier): The "bootstrap" mode is the same as "auto" except that weights are computed based on the bootstrap sample for every tree grown. + For multi-output, the weights of each column of y will be multiplied. + Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. @@ -1092,8 +1090,7 @@ class ExtraTreesClassifier(ForestClassifier): Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same - order as the columns of y. For multi-output, the weights of each - column of y will be multiplied together. + order as the columns of y. The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. @@ -1101,6 +1098,8 @@ class ExtraTreesClassifier(ForestClassifier): The "bootstrap" mode is the same as "auto" except that weights are computed based on the bootstrap sample for every tree grown. + For multi-output, the weights of each column of y will be multiplied. + Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. 
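The multi-output behaviour documented above ("the weights of each column of y will be multiplied") is implemented by the `np.prod(cw, axis=0, dtype=np.float64)` at the end of `_validate_y_cw`: one weight vector is expanded per output column, and their per-sample product is what `fit` ultimately uses. A self-contained sketch with a toy two-output y and illustrative per-output dicts, again assuming the positional `compute_class_weight` signature of this era:

    import numpy as np
    from sklearn.utils import compute_class_weight

    y = np.array([[0, 1],
                  [0, 2],
                  [1, 1],
                  [1, 2]])
    class_weight = [{0: 1., 1: 2.},   # dict for output column 0
                    {1: 1., 2: 3.}]   # dict for output column 1

    cw = []
    for k in range(y.shape[1]):
        classes_k = np.unique(y[:, k])
        weight_k = compute_class_weight(class_weight[k], classes_k, y[:, k])
        # map the per-class weights back onto the samples of column k
        cw.append(weight_k[np.searchsorted(classes_k, y[:, k])])

    # the per-sample weight is the product over the outputs
    print(np.prod(cw, axis=0, dtype=np.float64))   # [ 1.  3.  2.  6.]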
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 7407d272ecca8..ab834821d46fe 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -235,7 +235,6 @@ def test_unfitted_feature_importances(): yield check_unfitted_feature_importances, name - def check_oob_score(name, X, y, n_estimators=20): """Check that oob prediction is a good estimation of the generalization error.""" @@ -712,7 +711,6 @@ def check_memory_layout(name, dtype): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided X = np.asarray(iris.data[::3], dtype=dtype) y = iris.target[::3] @@ -758,6 +756,20 @@ def check_class_weights(name): clf2.fit(iris.data, iris.target) assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + # Make a multi-output problem with three copies of Iris + iris_multi = np.vstack((iris.target, iris.target, iris.target)).T + # Create user-defined weights that should balance over the outputs + clf3 = ForestClassifier(class_weight=[{0: 2., 1: 2., 2: 1.}, + {0: 2., 1: 1., 2: 2.}, + {0: 1., 1: 2., 2: 2.}], + random_state=0) + clf3.fit(iris.data, iris_multi) + assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_) + # Check against multi-output "auto" which should also have no effect + clf4 = ForestClassifier(class_weight='auto', random_state=0) + clf4.fit(iris.data, iris_multi) + assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_) + # Inflate importance of class 1, check against user-defined weights sample_weight = np.ones(iris.target.shape) sample_weight[iris.target == 1] *= 100 diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index bd08fcdeadd55..bca4fafec8816 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -848,6 +848,54 @@ def test_sample_weight_invalid(): assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight) +def check_class_weights(name): + """Check class_weights resemble sample_weights behavior.""" + TreeClassifier = CLF_TREES[name] + + # Iris is balanced, so no effect expected for using 'auto' weights + clf1 = TreeClassifier(random_state=0) + clf1.fit(iris.data, iris.target) + clf2 = TreeClassifier(class_weight='auto', random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Make a multi-output problem with three copies of Iris + iris_multi = np.vstack((iris.target, iris.target, iris.target)).T + # Create user-defined weights that should balance over the outputs + clf3 = TreeClassifier(class_weight=[{0: 2., 1: 2., 2: 1.}, + {0: 2., 1: 1., 2: 2.}, + {0: 1., 1: 2., 2: 2.}], + random_state=0) + clf3.fit(iris.data, iris_multi) + assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_) + # Check against multi-output "auto" which should also have no effect + clf4 = TreeClassifier(class_weight='auto', random_state=0) + clf4.fit(iris.data, iris_multi) + assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1., 1: 100., 2: 1.} + clf1 = TreeClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight) + clf2 = TreeClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, 
clf2.feature_importances_) + + # Check that sample_weight and class_weight are multiplicative + clf1 = TreeClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight**2) + clf2 = TreeClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + +def test_class_weights(): + for name in CLF_TREES: + yield check_class_weights, name + + def test_max_leaf_nodes(): """Test greedy trees with max_depth + 1 leafs. """ from sklearn.tree._tree import TREE_LEAF @@ -988,7 +1036,7 @@ def check_sparse_input(tree, dataset, max_depth=None): if tree in CLF_TREES: assert_array_almost_equal(s.predict_proba(X_sparse_test), - y_proba) + y_proba) assert_array_almost_equal(s.predict_log_proba(X_sparse_test), y_log_proba) @@ -1078,6 +1126,7 @@ def check_sparse_criterion(tree, dataset): "trees".format(tree)) assert_array_almost_equal(s.predict(X), d.predict(X)) + def test_sparse_criterion(): for tree, dataset in product(SPARSE_TREES, ["sparse-pos", "sparse-neg", "sparse-mix", @@ -1104,7 +1153,7 @@ def check_explicit_sparse_zeros(tree, max_depth=3, n_nonzero_i = random_state.binomial(n_samples, 0.5) indices_i = random_state.permutation(samples)[:n_nonzero_i] indices.append(indices_i) - data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i, )) - 1 + data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i, )) - 1 data.append(data_i) offset += n_nonzero_i indptr.append(offset) @@ -1195,5 +1244,3 @@ def check_min_weight_leaf_split_level(name): def test_min_weight_leaf_split_level(): for name in ALL_TREES: yield check_min_weight_leaf_split_level, name - - diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index bf6e2948c14f5..8a50cb66b7da9 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -25,7 +25,7 @@ from ..base import BaseEstimator, ClassifierMixin, RegressorMixin from ..externals import six from ..feature_selection.from_model import _LearntSelectorMixin -from ..utils import check_array, check_random_state +from ..utils import check_array, check_random_state, compute_class_weight from ._tree import Criterion @@ -80,7 +80,8 @@ def __init__(self, min_weight_fraction_leaf, max_features, max_leaf_nodes, - random_state): + random_state, + class_weight=None): self.criterion = criterion self.splitter = splitter self.max_depth = max_depth @@ -90,6 +91,7 @@ def __init__(self, self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes + self.class_weight = class_weight self.n_features_ = None self.n_outputs_ = None @@ -145,6 +147,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) + cw = None if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs @@ -159,11 +162,29 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.classes_ = [] self.n_classes_ = [] + if self.class_weight is not None: + y_org = np.copy(y) + for k in range(self.n_outputs_): classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) + if self.class_weight is not None: + cw = [] + for k in range(self.n_outputs_): + if self.n_outputs_ == 1 or self.class_weight == 'auto': + class_weight_k = self.class_weight + else: + class_weight_k = self.class_weight[k] + cw_part = compute_class_weight(class_weight_k, + self.classes_[k], + y_org[:, k]) + cw_part = 
cw_part[np.searchsorted(self.classes_[k], + y_org[:, k])] + cw.append(cw_part) + cw = np.prod(cw, axis=0, dtype=np.float64) + else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ @@ -239,6 +260,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): "number of samples=%d" % (len(sample_weight), n_samples)) + if cw is not None: + if sample_weight is not None: + sample_weight *= cw + else: + sample_weight = cw + # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * @@ -427,6 +454,21 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + class_weight : dict, list of dicts, "auto" or None, optional + (default=None) + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + The "auto" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data. + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -496,7 +538,8 @@ def __init__(self, min_weight_fraction_leaf=0., max_features=None, random_state=None, - max_leaf_nodes=None): + max_leaf_nodes=None, + class_weight=None): super(DecisionTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, @@ -506,6 +549,7 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, + class_weight=class_weight, random_state=random_state) def predict_proba(self, X): @@ -749,7 +793,8 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", random_state=None, - max_leaf_nodes=None): + max_leaf_nodes=None, + class_weight=None): super(ExtraTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, @@ -759,6 +804,7 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, + class_weight=class_weight, random_state=random_state) From 2b24ccefa02153ae17648f45bd4142af0f98ce51 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 3 Jan 2015 13:15:42 -0800 Subject: [PATCH 5/8] parameter validation checks & tests for errors --- sklearn/ensemble/forest.py | 65 ++++++++++++++++++--------- sklearn/ensemble/tests/test_forest.py | 30 +++++++++++++ sklearn/tree/tests/test_tree.py | 24 ++++++++++ sklearn/tree/tree.py | 14 ++++++ 4 files changed, 113 insertions(+), 20 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 23afed13e4c5b..2540f1647e930 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -98,10 +98,10 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, cw_part = compute_class_weight('auto', classes_boot, y_boot) # Expand class weights to cover all classes in original y # (in case some were missing from the bootstrap sample) - cw_part 
= np.array([cw_part[np.where(classes_boot == w)][0] - if w in classes_boot + cw_part = np.array([cw_part[np.where(classes_boot == c)][0] + if c in classes_boot else 0. - for w in classes_full]) + for c in classes_full]) # Expand weights over the original y for this output cw_part = cw_part[np.searchsorted(classes_full, y_full)] cw.append(cw_part) @@ -418,23 +418,48 @@ def _validate_y_cw(self, y_org): self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) - if (self.class_weight is not None and - (self.class_weight != 'bootstrap' or not self.bootstrap)): - cw = [] - for k in range(self.n_outputs_): - if self.class_weight in ['auto', 'bootstrap']: - class_weight_k = 'auto' - elif self.n_outputs_ == 1: - class_weight_k = self.class_weight - else: - class_weight_k = self.class_weight[k] - cw_part = compute_class_weight(class_weight_k, - self.classes_[k], - y_org[:, k]) - cw_part = cw_part[np.searchsorted(self.classes_[k], - y_org[:, k])] - cw.append(cw_part) - cw = np.prod(cw, axis=0, dtype=np.float64) + if self.class_weight is not None: + valid_presets = ['auto', 'bootstrap'] + if isinstance(self.class_weight, six.string_types): + if self.class_weight not in valid_presets: + raise ValueError('Valid presets for class_weight include ' + '"auto" and "bootstrap". Given "%s".' + % self.class_weight) + if self.warm_start: + warn('class_weight presets "auto" or "bootstrap" are ' + 'not recommended for warm_start if the fitted data ' + 'differs from the full dataset. In order to use ' + '"auto" weights, use compute_class_weight("auto", ' + 'classes, y). In place of y you can use a large ' + 'enough sample of the full training set target to ' + 'properly estimate the class frequency ' + 'distributions. Pass the resulting weights as the ' + 'class_weight parameter.') + elif self.n_outputs_ > 1: + if not hasattr(self.class_weight, "__iter__"): + raise ValueError("For multi-output, class_weight should " + "be a list of dicts, or a valid string.") + elif len(self.class_weight) != self.n_outputs_: + raise ValueError("For multi-output, number of elements " + "in class_weight should match number of " + "outputs.") + + if self.class_weight != 'bootstrap' or not self.bootstrap: + cw = [] + for k in range(self.n_outputs_): + if self.class_weight in valid_presets: + class_weight_k = 'auto' + elif self.n_outputs_ == 1: + class_weight_k = self.class_weight + else: + class_weight_k = self.class_weight[k] + cw_part = compute_class_weight(class_weight_k, + self.classes_[k], + y_org[:, k]) + cw_part = cw_part[np.searchsorted(self.classes_[k], + y_org[:, k])] + cw.append(cw_part) + cw = np.prod(cw, axis=0, dtype=np.float64) return y, cw diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index ab834821d46fe..6a96b6937d82b 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -811,6 +811,36 @@ def test_class_weight_auto_and_bootstrap_multi_output(): yield check_class_weight_auto_and_bootstrap_multi_output, name +def check_class_weight_errors(name): + """Test if class_weight raises errors and warnings when expected.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + _y = np.vstack((y, np.array(y) * 2)).T + + # Invalid preset string + clf = ForestClassifier(class_weight='the larch', random_state=0) + assert_raises(ValueError, clf.fit, X, y) + assert_raises(ValueError, clf.fit, X, _y) + + # Warning warm_start with preset + clf = ForestClassifier(class_weight='auto', warm_start=True, + random_state=0) + 
assert_warns(UserWarning, clf.fit, X, y) + assert_warns(UserWarning, clf.fit, X, _y) + + # Not a list or preset for multi-output + clf = ForestClassifier(class_weight=1, random_state=0) + assert_raises(ValueError, clf.fit, X, _y) + + # Incorrect length list for multi-output + clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}], random_state=0) + assert_raises(ValueError, clf.fit, X, _y) + + +def test_class_weight_errors(): + for name in FOREST_CLASSIFIERS: + yield check_class_weight_errors, name + + def check_warm_start(name, random_state=42): """Test if fitting incrementally with warm start gives a forest of the right size and the same results as a normal fit.""" diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index bca4fafec8816..c8f68b21bbe58 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -896,6 +896,30 @@ def test_class_weights(): yield check_class_weights, name +def check_class_weight_errors(name): + """Test if class_weight raises errors and warnings when expected.""" + TreeClassifier = CLF_TREES[name] + _y = np.vstack((y, np.array(y) * 2)).T + + # Invalid preset string + clf = TreeClassifier(class_weight='the larch', random_state=0) + assert_raises(ValueError, clf.fit, X, y) + assert_raises(ValueError, clf.fit, X, _y) + + # Not a list or preset for multi-output + clf = TreeClassifier(class_weight=1, random_state=0) + assert_raises(ValueError, clf.fit, X, _y) + + # Incorrect length list for multi-output + clf = TreeClassifier(class_weight=[{-1: 0.5, 1: 1.}], random_state=0) + assert_raises(ValueError, clf.fit, X, _y) + + +def test_class_weight_errors(): + for name in CLF_TREES: + yield check_class_weight_errors, name + + def test_max_leaf_nodes(): """Test greedy trees with max_depth + 1 leafs. """ from sklearn.tree._tree import TREE_LEAF diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 8a50cb66b7da9..f5100f8cc4d37 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -171,6 +171,20 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.n_classes_.append(classes_k.shape[0]) if self.class_weight is not None: + if isinstance(self.class_weight, six.string_types): + if self.class_weight != "auto": + raise ValueError('The only supported preset for ' + 'class_weight is "auto". Given "%s".' + % self.class_weight) + elif self.n_outputs_ > 1: + if not hasattr(self.class_weight, "__iter__"): + raise ValueError('For multi-output, class_weight ' + 'should be a list of dicts, or ' + '"auto".') + elif len(self.class_weight) != self.n_outputs_: + raise ValueError("For multi-output, number of " + "elements in class_weight should " + "match number of outputs.") cw = [] for k in range(self.n_outputs_): if self.n_outputs_ == 1 or self.class_weight == 'auto': From ac9ab961bb55d9e39834a521ac022aeb0157bc9d Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Thu, 8 Jan 2015 02:31:24 -0800 Subject: [PATCH 6/8] Y-org rename & whats_new update --- doc/whats_new.rst | 7 +++++++ sklearn/ensemble/forest.py | 8 ++++---- sklearn/tree/tree.py | 6 +++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 8816f72590a90..c5db9b5c7bbb5 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -136,6 +136,11 @@ Enhancements - Add ``n_iter_`` attribute to estimators that accept a ``max_iter`` attribute in their constructor. By `Manoj Kumar`_. 
+ - Add ``class_weight`` parameter to automatically weight samples by class + frequency for :class:`ensemble.RandomForestClassifier`, + :class:`tree.DecisionTreeClassifier`, :class:`ensemble.ExtraTreesClassifier` + and :class:`tree.ExtraTreeClassifier`. By `Trevor Stephens`_. + Documentation improvements .......................... @@ -3123,3 +3128,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Matt Terry: https://github.com/mrterry .. _Antony Lee: https://www.ocf.berkeley.edu/~antonyl/ + +.. _Trevor Stephens: http://trevorstephens.com/ diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 2540f1647e930..d9c1a2864294e 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -406,8 +406,8 @@ def _set_oob_score(self, X, y): self.oob_score_ = oob_score / self.n_outputs_ - def _validate_y_cw(self, y_org): - y = np.copy(y_org) + def _validate_y_cw(self, y_original): + y = np.copy(y_original) cw = None self.classes_ = [] @@ -455,9 +455,9 @@ def _validate_y_cw(self, y_org): class_weight_k = self.class_weight[k] cw_part = compute_class_weight(class_weight_k, self.classes_[k], - y_org[:, k]) + y_original[:, k]) cw_part = cw_part[np.searchsorted(self.classes_[k], - y_org[:, k])] + y_original[:, k])] cw.append(cw_part) cw = np.prod(cw, axis=0, dtype=np.float64) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index f5100f8cc4d37..9cbae9e76e43c 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -163,7 +163,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.n_classes_ = [] if self.class_weight is not None: - y_org = np.copy(y) + y_original = np.copy(y) for k in range(self.n_outputs_): classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True) @@ -193,9 +193,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): class_weight_k = self.class_weight[k] cw_part = compute_class_weight(class_weight_k, self.classes_[k], - y_org[:, k]) + y_original[:, k]) cw_part = cw_part[np.searchsorted(self.classes_[k], - y_org[:, k])] + y_original[:, k])] cw.append(cw_part) cw = np.prod(cw, axis=0, dtype=np.float64) From cad87b578cf8a91d4d59834452ff632819d60279 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 10 Jan 2015 11:49:42 -0800 Subject: [PATCH 7/8] rename vars & copy sample_weight --- sklearn/ensemble/forest.py | 61 +++++++++++++++++++++----------------- sklearn/tree/tree.py | 26 ++++++++-------- 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 3ff2abca16965..43c9866145b98 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -88,25 +88,27 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, curr_sample_weight *= sample_counts if class_weight == 'bootstrap': - cw = [curr_sample_weight] + expanded_class_weight = [curr_sample_weight] for k in range(y.shape[1]): y_full = y[:, k] classes_full = np.unique(y_full) y_boot = y_full[indices] classes_boot = np.unique(y_boot) # Get class weights for the bootstrap sample - cw_part = compute_class_weight('auto', classes_boot, y_boot) + weight_k = compute_class_weight('auto', classes_boot, y_boot) # Expand class weights to cover all classes in original y # (in case some were missing from the bootstrap sample) - cw_part = np.array([cw_part[np.where(classes_boot == c)][0] - if c in classes_boot - else 0. - for c in classes_full]) + weight_k = np.array([weight_k[np.where(classes_boot == c)][0] + if c in classes_boot + else 0. 
+ for c in classes_full]) # Expand weights over the original y for this output - cw_part = cw_part[np.searchsorted(classes_full, y_full)] - cw.append(cw_part) + weight_k = weight_k[np.searchsorted(classes_full, y_full)] + expanded_class_weight.append(weight_k) # Multiply all weights by sample & bootstrap weights - curr_sample_weight = np.prod(cw, axis=0, dtype=np.float64) + curr_sample_weight = np.prod(expanded_class_weight, + axis=0, + dtype=np.float64) tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) @@ -232,16 +234,16 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] - y, cw = self._validate_y_cw(y) + y, expanded_class_weight = self._validate_y_class_weight(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) - if cw is not None: + if expanded_class_weight is not None: if sample_weight is not None: - sample_weight *= cw + sample_weight = np.copy(sample_weight) * expanded_class_weight else: - sample_weight = cw + sample_weight = expanded_class_weight # Check parameters self._validate_estimator() @@ -306,7 +308,7 @@ def fit(self, X, y, sample_weight=None): def _set_oob_score(self, X, y): """Calculate out of bag predictions and score.""" - def _validate_y_cw(self, y): + def _validate_y_class_weight(self, y): # Default implementation return y, None @@ -406,9 +408,12 @@ def _set_oob_score(self, X, y): self.oob_score_ = oob_score / self.n_outputs_ - def _validate_y_cw(self, y_original): - y = np.copy(y_original) - cw = None + def _validate_y_class_weight(self, y): + y = np.copy(y) + expanded_class_weight = None + + if self.class_weight is not None: + y_original = np.copy(y) self.classes_ = [] self.n_classes_ = [] @@ -445,7 +450,7 @@ def _validate_y_cw(self, y_original): "outputs.") if self.class_weight != 'bootstrap' or not self.bootstrap: - cw = [] + expanded_class_weight = [] for k in range(self.n_outputs_): if self.class_weight in valid_presets: class_weight_k = 'auto' @@ -453,15 +458,17 @@ def _validate_y_cw(self, y_original): class_weight_k = self.class_weight else: class_weight_k = self.class_weight[k] - cw_part = compute_class_weight(class_weight_k, - self.classes_[k], - y_original[:, k]) - cw_part = cw_part[np.searchsorted(self.classes_[k], - y_original[:, k])] - cw.append(cw_part) - cw = np.prod(cw, axis=0, dtype=np.float64) - - return y, cw + weight_k = compute_class_weight(class_weight_k, + self.classes_[k], + y_original[:, k]) + weight_k = weight_k[np.searchsorted(self.classes_[k], + y_original[:, k])] + expanded_class_weight.append(weight_k) + expanded_class_weight = np.prod(expanded_class_weight, + axis=0, + dtype=np.float64) + + return y, expanded_class_weight def predict(self, X): """Predict class for X. 
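The change above from `sample_weight *= cw` to `np.copy(sample_weight) * expanded_class_weight` matters because the in-place multiply wrote into the very array the caller passed to `fit` (the next patch then drops the explicit `np.copy`, since the binary `*` already allocates a fresh array). A toy demonstration of the aliasing this avoids, with illustrative names:

    import numpy as np

    user_weights = np.ones(4)          # array owned by the caller
    cw = np.array([1., 2., 2., 1.])    # expanded class weights

    def fit_inplace(sample_weight):
        sample_weight *= cw            # writes into the caller's array
        return sample_weight

    def fit_copy(sample_weight):
        return sample_weight * cw      # binary * allocates a new array

    fit_copy(user_weights)
    print(user_weights)    # [ 1.  1.  1.  1.]  caller's array untouched

    fit_inplace(user_weights)
    print(user_weights)    # [ 1.  2.  2.  1.]  caller's array mutated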
diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index f90fa582cba68..db53f481f6434 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -147,7 +147,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) - cw = None + expanded_class_weight = None if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs @@ -185,19 +185,21 @@ def fit(self, X, y, sample_weight=None, check_input=True): raise ValueError("For multi-output, number of " "elements in class_weight should " "match number of outputs.") - cw = [] + expanded_class_weight = [] for k in range(self.n_outputs_): if self.n_outputs_ == 1 or self.class_weight == 'auto': class_weight_k = self.class_weight else: class_weight_k = self.class_weight[k] - cw_part = compute_class_weight(class_weight_k, - self.classes_[k], - y_original[:, k]) - cw_part = cw_part[np.searchsorted(self.classes_[k], - y_original[:, k])] - cw.append(cw_part) - cw = np.prod(cw, axis=0, dtype=np.float64) + weight_k = compute_class_weight(class_weight_k, + self.classes_[k], + y_original[:, k]) + weight_k = weight_k[np.searchsorted(self.classes_[k], + y_original[:, k])] + expanded_class_weight.append(weight_k) + expanded_class_weight = np.prod(expanded_class_weight, + axis=0, + dtype=np.float64) else: self.classes_ = [None] * self.n_outputs_ @@ -274,11 +276,11 @@ def fit(self, X, y, sample_weight=None, check_input=True): "number of samples=%d" % (len(sample_weight), n_samples)) - if cw is not None: + if expanded_class_weight is not None: if sample_weight is not None: - sample_weight *= cw + sample_weight = np.copy(sample_weight) * expanded_class_weight else: - sample_weight = cw + sample_weight = expanded_class_weight # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: From 35c2535bfc838db31a14b4da9d0f4bd195b9e252 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 12 Jan 2015 21:27:13 -0800 Subject: [PATCH 8/8] rename cw option to subsample & refactor its implementation --- sklearn/ensemble/forest.py | 41 ++++++++++++++------------- sklearn/ensemble/tests/test_forest.py | 2 +- sklearn/tree/tree.py | 2 +- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index f383964065402..076df001b9899 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -87,24 +87,27 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, sample_counts = np.bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts - if class_weight == 'bootstrap': + if class_weight == 'subsample': + expanded_class_weight = [curr_sample_weight] + for k in range(y.shape[1]): y_full = y[:, k] classes_full = np.unique(y_full) - y_boot = y_full[indices] + y_boot = y[indices, k] classes_boot = np.unique(y_boot) - # Get class weights for the bootstrap sample - weight_k = compute_class_weight('auto', classes_boot, y_boot) - # Expand class weights to cover all classes in original y - # (in case some were missing from the bootstrap sample) - weight_k = np.array([weight_k[np.where(classes_boot == c)][0] - if c in classes_boot - else 0. 
- for c in classes_full]) + + # Get class weights for the bootstrap sample, covering all + # classes in case some were missing from the bootstrap sample + weight_k = np.choose( + np.searchsorted(classes_boot, classes_full), + compute_class_weight('auto', classes_boot, y_boot), + mode='clip') + # Expand weights over the original y for this output weight_k = weight_k[np.searchsorted(classes_full, y_full)] expanded_class_weight.append(weight_k) + # Multiply all weights by sample & bootstrap weights curr_sample_weight = np.prod(expanded_class_weight, axis=0, @@ -243,7 +246,7 @@ def fit(self, X, y, sample_weight=None): if expanded_class_weight is not None: if sample_weight is not None: - sample_weight = np.copy(sample_weight) * expanded_class_weight + sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight @@ -428,14 +431,14 @@ def _validate_y_class_weight(self, y): self.n_classes_.append(classes_k.shape[0]) if self.class_weight is not None: - valid_presets = ['auto', 'bootstrap'] + valid_presets = ('auto', 'subsample') if isinstance(self.class_weight, six.string_types): if self.class_weight not in valid_presets: raise ValueError('Valid presets for class_weight include ' - '"auto" and "bootstrap". Given "%s".' + '"auto" and "subsample". Given "%s".' % self.class_weight) if self.warm_start: - warn('class_weight presets "auto" or "bootstrap" are ' + warn('class_weight presets "auto" or "subsample" are ' 'not recommended for warm_start if the fitted data ' 'differs from the full dataset. In order to use ' '"auto" weights, use compute_class_weight("auto", ' @@ -453,7 +456,7 @@ def _validate_y_class_weight(self, y): "in class_weight should match number of " "outputs.") - if self.class_weight != 'bootstrap' or not self.bootstrap: + if self.class_weight != 'subsample' or not self.bootstrap: expanded_class_weight = [] for k in range(self.n_outputs_): if self.class_weight in valid_presets: @@ -797,7 +800,7 @@ class RandomForestClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. - class_weight : dict, list of dicts, "auto", "bootstrap" or None, optional + class_weight : dict, list of dicts, "auto", "subsample" or None, optional Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For @@ -807,7 +810,7 @@ class RandomForestClassifier(ForestClassifier): The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. - The "bootstrap" mode is the same as "auto" except that weights are + The "subsample" mode is the same as "auto" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. @@ -1127,7 +1130,7 @@ class ExtraTreesClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. - class_weight : dict, list of dicts, "auto", "bootstrap" or None, optional + class_weight : dict, list of dicts, "auto", "subsample" or None, optional Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For @@ -1137,7 +1140,7 @@ class ExtraTreesClassifier(ForestClassifier): The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. 
- The "bootstrap" mode is the same as "auto" except that weights are + The "subsample" mode is the same as "auto" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 6a96b6937d82b..cd0697af20500 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -802,7 +802,7 @@ def check_class_weight_auto_and_bootstrap_multi_output(name): clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}, {-2: 1., 2: 1.}], random_state=0) clf.fit(X, _y) - clf = ForestClassifier(class_weight='bootstrap', random_state=0) + clf = ForestClassifier(class_weight='subsample', random_state=0) clf.fit(X, _y) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 64989a356a75f..6dd0cf2999bcc 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -279,7 +279,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): if expanded_class_weight is not None: if sample_weight is not None: - sample_weight = np.copy(sample_weight) * expanded_class_weight + sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight
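The `np.choose(np.searchsorted(classes_boot, classes_full), ..., mode='clip')` refactor above replaces the earlier list comprehension that mapped weights computed on the bootstrap sample back onto the full class set. One subtlety: a class absent from the bootstrap sample now receives a clipped neighbour's weight rather than the 0. the old code assigned, but the end result is identical, because any sample of such a class necessarily has a bootstrap count of zero in `curr_sample_weight`, so its final product is zero either way. A minimal sketch of the index-and-clip step, with toy arrays:

    import numpy as np

    classes_full = np.array([0, 1, 2])
    classes_boot = np.array([0, 1])   # class 2 missing from the bootstrap
    cw_boot = np.array([0.75, 1.5])   # weights for the bootstrap classes

    idx = np.searchsorted(classes_boot, classes_full)   # [0, 1, 2]
    weight_k = np.choose(idx, cw_boot, mode='clip')     # 2 clipped to 1
    print(weight_k)   # [ 0.75  1.5   1.5 ]  class 2 gets a stand-in weight

    # Harmless: samples of class 2 were never drawn, so their count from
    # np.bincount(indices) is 0 and zeroes out curr_sample_weight anyway.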