From de35beab435777bc033da42102ec2c1459235104 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Thu, 11 Dec 2014 16:22:26 -0800 Subject: [PATCH 1/8] add support for class_weights --- sklearn/ensemble/forest.py | 79 ++++++++++++++++++++++----- sklearn/ensemble/tests/test_forest.py | 47 ++++++++++++++++ sklearn/utils/estimator_checks.py | 2 + 3 files changed, 114 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index c370f6bc87340..e6acbb81fe11d 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -58,7 +58,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, check_array +from ..utils import check_random_state, check_array, compute_class_weight from ..utils.validation import DataConversionWarning from .base import BaseEnsemble, _partition_estimators @@ -122,7 +122,8 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + class_weight=None): super(BaseForest, self).__init__( base_estimator=base_estimator, n_estimators=n_estimators, @@ -134,6 +135,7 @@ def __init__(self, self.random_state = random_state self.verbose = verbose self.warm_start = warm_start + self.class_weight = class_weight def apply(self, X): """Apply trees in the forest to X, return leaf indices. @@ -211,11 +213,17 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] - y = self._validate_y(y) + y, cw = self._validate_y_cw(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) + if cw is not None: + if sample_weight is not None: + sample_weight *= cw + else: + sample_weight = cw + # Check parameters self._validate_estimator() @@ -279,9 +287,9 @@ def fit(self, X, y, sample_weight=None): def _set_oob_score(self, X, y): """Calculate out of bag predictions and score.""" - def _validate_y(self, y): + def _validate_y_cw(self, y): # Default implementation - return y + return y, None @property def feature_importances_(self): @@ -320,7 +328,8 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + class_weight=None): super(ForestClassifier, self).__init__( base_estimator, @@ -331,7 +340,8 @@ def __init__(self, n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start) + warm_start=warm_start, + class_weight=class_weight) def _set_oob_score(self, X, y): """Compute out-of-bag score""" @@ -377,8 +387,9 @@ def _set_oob_score(self, X, y): self.oob_score_ = oob_score / self.n_outputs_ - def _validate_y(self, y): - y = np.copy(y) + def _validate_y_cw(self, y_org): + y = np.copy(y_org) + cw = None self.classes_ = [] self.n_classes_ = [] @@ -388,7 +399,19 @@ def _validate_y(self, y): self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) - return y + if self.class_weight is not None: + if self.n_outputs_ == 1: + cw = compute_class_weight(self.class_weight, + self.classes_[0], + y_org[:, 0]) + cw = cw[np.searchsorted(self.classes_[0], y_org[:, 0])] + else: + raise NotImplementedError('class_weights are not supported ' + 'for multi-output. You may use ' + 'sample_weights in the fit method ' + 'to weight by sample.') + + return y, cw def predict(self, X): """Predict class for X. 
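The core mechanism of this first patch is the expansion in `_validate_y_cw`: `compute_class_weight` returns one weight per class, and indexing that array through `np.searchsorted(self.classes_[0], y_org[:, 0])` broadcasts it to one weight per sample, which `fit` then multiplies into `sample_weight`. A minimal standalone sketch of that expansion, with toy labels and illustrative names; note it assumes the scikit-learn API of this era (the 'auto' preset was later renamed 'balanced', and the arguments of `compute_class_weight` later became keyword-only):

    import numpy as np
    from sklearn.utils import compute_class_weight

    y = np.array([0, 0, 0, 0, 1, 1, 2])   # imbalanced toy labels
    classes = np.unique(y)                 # sorted unique class labels

    # one weight per class, inversely proportional to class frequency
    cw = compute_class_weight('auto', classes, y)

    # np.unique returns sorted labels, so searchsorted recovers each
    # sample's class index; indexing cw with it gives per-sample weights
    sample_weight = cw[np.searchsorted(classes, y)]
    print(sample_weight)   # the rarer classes receive the larger weights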
@@ -707,6 +730,18 @@ class RandomForestClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. + class_weight : dict, {class_label: weight} or "auto" or None, optional + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "auto" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies. + + Note that this is only supported for single-output classification. + + Note that these weights will be multiplied with class_weight (passed + through the fit method) if sample_weight is specified + Attributes ---------- estimators_ : list of DecisionTreeClassifier @@ -755,7 +790,8 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + class_weight=None): super(RandomForestClassifier, self).__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, @@ -768,7 +804,8 @@ def __init__(self, n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start) + warm_start=warm_start, + class_weight=class_weight) self.criterion = criterion self.max_depth = max_depth @@ -1017,6 +1054,18 @@ class ExtraTreesClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. + class_weight : dict, {class_label: weight} or "auto" or None, optional + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "auto" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies. + + Note that this is only supported for single-output classification. + + Note that these weights will be multiplied with class_weight (passed + through the fit method) if sample_weight is specified + Attributes ---------- estimators_ : list of DecisionTreeClassifier @@ -1068,7 +1117,8 @@ def __init__(self, n_jobs=1, random_state=None, verbose=0, - warm_start=False): + warm_start=False, + class_weight=None): super(ExtraTreesClassifier, self).__init__( base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, @@ -1080,7 +1130,8 @@ def __init__(self, n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start) + warm_start=warm_start, + class_weight=class_weight) self.criterion = criterion self.max_depth = max_depth diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index b37d760f7eaf4..5cccc48760724 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -747,6 +747,53 @@ def test_1d_input(): yield check_1d_input, name, X, X_2d, y +def check_class_weights(name): + """Check class_weights resemble sample_weights behavior.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + # Iris is balanced, so no effect expected for using 'auto' weights + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target) + clf2 = ForestClassifier(class_weight='auto', random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1., 1: 100., 2: 1.} + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight) + clf2 = ForestClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target) + 
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Check that sample_weight and class_weight are multiplicative + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight**2) + clf2 = ForestClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + +def test_class_weights(): + for name in FOREST_CLASSIFIERS: + yield check_class_weights, name + + +def check_class_weight_failure_multi_output(name): + """Test class_weight failure for multi-output""" + ForestClassifier = FOREST_CLASSIFIERS[name] + _y = np.vstack((y, np.array(y) * 2)).T + clf = ForestClassifier(class_weight='auto') + assert_raises(NotImplementedError, clf.fit, X, _y) + + +def test_class_weight_failure_multi_output(): + for name in FOREST_CLASSIFIERS: + yield check_class_weight_failure_multi_output, name + + def check_warm_start(name, random_state=42): """Test if fitting incrementally with warm start gives a forest of the right size and the same results as a normal fit.""" diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6dd457685e2a0..3a3c55aaed95d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -737,6 +737,8 @@ def check_class_weight_classifiers(name, Classifier): classifier = Classifier(class_weight=class_weight) if hasattr(classifier, "n_iter"): classifier.set_params(n_iter=100) + if hasattr(classifier, "min_weight_fraction_leaf"): + classifier.set_params(min_weight_fraction_leaf=0.01) set_random_state(classifier) classifier.fit(X_train, y_train) From b131ad23fab1416fed68931be2d1380448c23b20 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sun, 14 Dec 2014 13:31:31 -0800 Subject: [PATCH 2/8] add multioutput support & bootstrap auto mode --- sklearn/ensemble/forest.py | 96 +++++++++++++++++++-------- sklearn/ensemble/tests/test_forest.py | 17 +++-- 2 files changed, 78 insertions(+), 35 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index e6acbb81fe11d..1b346d857f7ed 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -41,8 +41,6 @@ class calls the ``fit`` method of each sub-estimator on random samples from __future__ import division -import numpy as np - from warnings import warn from abc import ABCMeta, abstractmethod @@ -72,7 +70,7 @@ class calls the ``fit`` method of each sub-estimator on random samples def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, - verbose=0): + verbose=0, class_weight=None): """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) @@ -89,6 +87,27 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, sample_counts = np.bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts + if class_weight == 'bootstrap': + cw = [curr_sample_weight] + for k in range(y.shape[1]): + y_full = y[:, k] + classes_full = np.unique(y_full) + y_boot = y_full[indices] + classes_boot = np.unique(y_boot) + # Get class weights for the bootstrap sample + cw_part = compute_class_weight('auto', classes_boot, y_boot) + # Expand class weights to cover all classes in original y + # (in case some were missing from the bootstrap sample) + cw_part = np.array([cw_part[np.argwhere(classes_boot == w)] + if w in classes_boot + else 0. 
+ for w in classes_full], dtype=np.float64) + # Expand weights over the original y for this output + cw_part = cw_part[np.searchsorted(classes_full, y_full)] + cw.append(cw_part) + # Multiply all weights by sample & bootstrap weights + curr_sample_weight = np.prod(cw, axis=0, dtype=np.float64) + tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) tree.indices_ = sample_counts > 0. @@ -267,7 +286,7 @@ def fit(self, X, y, sample_weight=None): backend="threading")( delayed(_parallel_build_trees)( t, self, X, y, sample_weight, i, len(trees), - verbose=self.verbose) + verbose=self.verbose, class_weight=self.class_weight) for i, t in enumerate(trees)) # Collect newly grown trees @@ -399,17 +418,26 @@ def _validate_y_cw(self, y_org): self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) - if self.class_weight is not None: - if self.n_outputs_ == 1: - cw = compute_class_weight(self.class_weight, - self.classes_[0], - y_org[:, 0]) - cw = cw[np.searchsorted(self.classes_[0], y_org[:, 0])] - else: - raise NotImplementedError('class_weights are not supported ' - 'for multi-output. You may use ' - 'sample_weights in the fit method ' - 'to weight by sample.') + if (self.class_weight is not None and + (self.class_weight != 'bootstrap' or not self.bootstrap)): + cw = [] + for k in range(self.n_outputs_): + if self.class_weight in ['auto', 'bootstrap']: + cw_part = compute_class_weight('auto', + self.classes_[k], + y_org[:, k]) + elif self.n_outputs_ == 1: + cw_part = compute_class_weight(self.class_weight, + self.classes_[k], + y_org[:, k]) + else: + cw_part = compute_class_weight(self.class_weight[k], + self.classes_[k], + y_org[:, k]) + cw_part = cw_part[np.searchsorted(self.classes_[k], + y_org[:, k])] + cw.append(cw_part) + cw = np.prod(cw, axis=0, dtype=np.float64) return y, cw @@ -730,17 +758,22 @@ class RandomForestClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. - class_weight : dict, {class_label: weight} or "auto" or None, optional - Weights associated with classes. If not given, all classes - are supposed to have weight one. + class_weight : dict, list of dicts, "auto", "bootstrap" or None, optional + + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. For multi-output, the weights of each + column of y will be multiplied together. The "auto" mode uses the values of y to automatically adjust - weights inversely proportional to class frequencies. + weights inversely proportional to class frequencies in the input data. - Note that this is only supported for single-output classification. + The "bootstrap" mode is the same as "auto" except that weights are + computed based on the bootstrap sample for every tree grown. - Note that these weights will be multiplied with class_weight (passed - through the fit method) if sample_weight is specified + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. Attributes ---------- @@ -1054,17 +1087,22 @@ class ExtraTreesClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. - class_weight : dict, {class_label: weight} or "auto" or None, optional - Weights associated with classes. If not given, all classes - are supposed to have weight one. 
+ class_weight : dict, list of dicts, "auto", "bootstrap" or None, optional + + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. For multi-output, the weights of each + column of y will be multiplied together. The "auto" mode uses the values of y to automatically adjust - weights inversely proportional to class frequencies. + weights inversely proportional to class frequencies in the input data. - Note that this is only supported for single-output classification. + The "bootstrap" mode is the same as "auto" except that weights are + computed based on the bootstrap sample for every tree grown. - Note that these weights will be multiplied with class_weight (passed - through the fit method) if sample_weight is specified + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. Attributes ---------- diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 5cccc48760724..7407d272ecca8 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -781,17 +781,22 @@ def test_class_weights(): yield check_class_weights, name -def check_class_weight_failure_multi_output(name): - """Test class_weight failure for multi-output""" +def check_class_weight_auto_and_bootstrap_multi_output(name): + """Test class_weight works for multi-output""" ForestClassifier = FOREST_CLASSIFIERS[name] _y = np.vstack((y, np.array(y) * 2)).T - clf = ForestClassifier(class_weight='auto') - assert_raises(NotImplementedError, clf.fit, X, _y) + clf = ForestClassifier(class_weight='auto', random_state=0) + clf.fit(X, _y) + clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}, {-2: 1., 2: 1.}], + random_state=0) + clf.fit(X, _y) + clf = ForestClassifier(class_weight='bootstrap', random_state=0) + clf.fit(X, _y) -def test_class_weight_failure_multi_output(): +def test_class_weight_auto_and_bootstrap_multi_output(): for name in FOREST_CLASSIFIERS: - yield check_class_weight_failure_multi_output, name + yield check_class_weight_auto_and_bootstrap_multi_output, name def check_warm_start(name, random_state=42): From 4c77ae897bf4b3bbbe499d8450acf55d14245f6a Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sun, 14 Dec 2014 15:08:13 -0800 Subject: [PATCH 3/8] expanded class_weight dimension fix --- sklearn/ensemble/forest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 1b346d857f7ed..01286b4692efc 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -98,10 +98,10 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, cw_part = compute_class_weight('auto', classes_boot, y_boot) # Expand class weights to cover all classes in original y # (in case some were missing from the bootstrap sample) - cw_part = np.array([cw_part[np.argwhere(classes_boot == w)] + cw_part = np.array([cw_part[np.where(classes_boot == w)][0] if w in classes_boot else 0. 
- for w in classes_full], dtype=np.float64) + for w in classes_full]) # Expand weights over the original y for this output cw_part = cw_part[np.searchsorted(classes_full, y_full)] cw.append(cw_part) From 085d677ea2866e3e60d4b6323dfc8c4c9e841cb6 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 22 Dec 2014 16:50:58 -0800 Subject: [PATCH 4/8] add class_weight to trees, expand tests & minor refactor --- sklearn/ensemble/forest.py | 25 ++++++------ sklearn/ensemble/tests/test_forest.py | 16 +++++++- sklearn/tree/tests/test_tree.py | 55 +++++++++++++++++++++++++-- sklearn/tree/tree.py | 54 ++++++++++++++++++++++++-- 4 files changed, 127 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 01286b4692efc..23afed13e4c5b 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -423,17 +423,14 @@ def _validate_y_cw(self, y_org): cw = [] for k in range(self.n_outputs_): if self.class_weight in ['auto', 'bootstrap']: - cw_part = compute_class_weight('auto', - self.classes_[k], - y_org[:, k]) + class_weight_k = 'auto' elif self.n_outputs_ == 1: - cw_part = compute_class_weight(self.class_weight, - self.classes_[k], - y_org[:, k]) + class_weight_k = self.class_weight else: - cw_part = compute_class_weight(self.class_weight[k], - self.classes_[k], - y_org[:, k]) + class_weight_k = self.class_weight[k] + cw_part = compute_class_weight(class_weight_k, + self.classes_[k], + y_org[:, k]) cw_part = cw_part[np.searchsorted(self.classes_[k], y_org[:, k])] cw.append(cw_part) @@ -763,8 +760,7 @@ class RandomForestClassifier(ForestClassifier): Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same - order as the columns of y. For multi-output, the weights of each - column of y will be multiplied together. + order as the columns of y. The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. @@ -772,6 +768,8 @@ class RandomForestClassifier(ForestClassifier): The "bootstrap" mode is the same as "auto" except that weights are computed based on the bootstrap sample for every tree grown. + For multi-output, the weights of each column of y will be multiplied. + Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. @@ -1092,8 +1090,7 @@ class ExtraTreesClassifier(ForestClassifier): Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same - order as the columns of y. For multi-output, the weights of each - column of y will be multiplied together. + order as the columns of y. The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. @@ -1101,6 +1098,8 @@ class ExtraTreesClassifier(ForestClassifier): The "bootstrap" mode is the same as "auto" except that weights are computed based on the bootstrap sample for every tree grown. + For multi-output, the weights of each column of y will be multiplied. + Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. 
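The multi-output behaviour documented above ("the weights of each column of y will be multiplied") is implemented by the `np.prod(cw, axis=0, dtype=np.float64)` at the end of `_validate_y_cw`: one weight vector is expanded per output column, and their per-sample product is what `fit` ultimately uses. A self-contained sketch with a toy two-output y and illustrative per-output dicts, again assuming the positional `compute_class_weight` signature of this era:

    import numpy as np
    from sklearn.utils import compute_class_weight

    y = np.array([[0, 1],
                  [0, 2],
                  [1, 1],
                  [1, 2]])
    class_weight = [{0: 1., 1: 2.},   # dict for output column 0
                    {1: 1., 2: 3.}]   # dict for output column 1

    cw = []
    for k in range(y.shape[1]):
        classes_k = np.unique(y[:, k])
        weight_k = compute_class_weight(class_weight[k], classes_k, y[:, k])
        # map the per-class weights back onto the samples of column k
        cw.append(weight_k[np.searchsorted(classes_k, y[:, k])])

    # the per-sample weight is the product over the outputs
    print(np.prod(cw, axis=0, dtype=np.float64))   # [ 1.  3.  2.  6.]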
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 7407d272ecca8..ab834821d46fe 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -235,7 +235,6 @@ def test_unfitted_feature_importances(): yield check_unfitted_feature_importances, name - def check_oob_score(name, X, y, n_estimators=20): """Check that oob prediction is a good estimation of the generalization error.""" @@ -712,7 +711,6 @@ def check_memory_layout(name, dtype): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided X = np.asarray(iris.data[::3], dtype=dtype) y = iris.target[::3] @@ -758,6 +756,20 @@ def check_class_weights(name): clf2.fit(iris.data, iris.target) assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + # Make a multi-output problem with three copies of Iris + iris_multi = np.vstack((iris.target, iris.target, iris.target)).T + # Create user-defined weights that should balance over the outputs + clf3 = ForestClassifier(class_weight=[{0: 2., 1: 2., 2: 1.}, + {0: 2., 1: 1., 2: 2.}, + {0: 1., 1: 2., 2: 2.}], + random_state=0) + clf3.fit(iris.data, iris_multi) + assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_) + # Check against multi-output "auto" which should also have no effect + clf4 = ForestClassifier(class_weight='auto', random_state=0) + clf4.fit(iris.data, iris_multi) + assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_) + # Inflate importance of class 1, check against user-defined weights sample_weight = np.ones(iris.target.shape) sample_weight[iris.target == 1] *= 100 diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index bd08fcdeadd55..bca4fafec8816 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -848,6 +848,54 @@ def test_sample_weight_invalid(): assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight) +def check_class_weights(name): + """Check class_weights resemble sample_weights behavior.""" + TreeClassifier = CLF_TREES[name] + + # Iris is balanced, so no effect expected for using 'auto' weights + clf1 = TreeClassifier(random_state=0) + clf1.fit(iris.data, iris.target) + clf2 = TreeClassifier(class_weight='auto', random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Make a multi-output problem with three copies of Iris + iris_multi = np.vstack((iris.target, iris.target, iris.target)).T + # Create user-defined weights that should balance over the outputs + clf3 = TreeClassifier(class_weight=[{0: 2., 1: 2., 2: 1.}, + {0: 2., 1: 1., 2: 2.}, + {0: 1., 1: 2., 2: 2.}], + random_state=0) + clf3.fit(iris.data, iris_multi) + assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_) + # Check against multi-output "auto" which should also have no effect + clf4 = TreeClassifier(class_weight='auto', random_state=0) + clf4.fit(iris.data, iris_multi) + assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1., 1: 100., 2: 1.} + clf1 = TreeClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight) + clf2 = TreeClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, 
clf2.feature_importances_) + + # Check that sample_weight and class_weight are multiplicative + clf1 = TreeClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight**2) + clf2 = TreeClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + +def test_class_weights(): + for name in CLF_TREES: + yield check_class_weights, name + + def test_max_leaf_nodes(): """Test greedy trees with max_depth + 1 leafs. """ from sklearn.tree._tree import TREE_LEAF @@ -988,7 +1036,7 @@ def check_sparse_input(tree, dataset, max_depth=None): if tree in CLF_TREES: assert_array_almost_equal(s.predict_proba(X_sparse_test), - y_proba) + y_proba) assert_array_almost_equal(s.predict_log_proba(X_sparse_test), y_log_proba) @@ -1078,6 +1126,7 @@ def check_sparse_criterion(tree, dataset): "trees".format(tree)) assert_array_almost_equal(s.predict(X), d.predict(X)) + def test_sparse_criterion(): for tree, dataset in product(SPARSE_TREES, ["sparse-pos", "sparse-neg", "sparse-mix", @@ -1104,7 +1153,7 @@ def check_explicit_sparse_zeros(tree, max_depth=3, n_nonzero_i = random_state.binomial(n_samples, 0.5) indices_i = random_state.permutation(samples)[:n_nonzero_i] indices.append(indices_i) - data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i, )) - 1 + data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i, )) - 1 data.append(data_i) offset += n_nonzero_i indptr.append(offset) @@ -1195,5 +1244,3 @@ def check_min_weight_leaf_split_level(name): def test_min_weight_leaf_split_level(): for name in ALL_TREES: yield check_min_weight_leaf_split_level, name - - diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index bf6e2948c14f5..8a50cb66b7da9 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -25,7 +25,7 @@ from ..base import BaseEstimator, ClassifierMixin, RegressorMixin from ..externals import six from ..feature_selection.from_model import _LearntSelectorMixin -from ..utils import check_array, check_random_state +from ..utils import check_array, check_random_state, compute_class_weight from ._tree import Criterion @@ -80,7 +80,8 @@ def __init__(self, min_weight_fraction_leaf, max_features, max_leaf_nodes, - random_state): + random_state, + class_weight=None): self.criterion = criterion self.splitter = splitter self.max_depth = max_depth @@ -90,6 +91,7 @@ def __init__(self, self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes + self.class_weight = class_weight self.n_features_ = None self.n_outputs_ = None @@ -145,6 +147,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) + cw = None if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs @@ -159,11 +162,29 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.classes_ = [] self.n_classes_ = [] + if self.class_weight is not None: + y_org = np.copy(y) + for k in range(self.n_outputs_): classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) + if self.class_weight is not None: + cw = [] + for k in range(self.n_outputs_): + if self.n_outputs_ == 1 or self.class_weight == 'auto': + class_weight_k = self.class_weight + else: + class_weight_k = self.class_weight[k] + cw_part = compute_class_weight(class_weight_k, + self.classes_[k], + y_org[:, k]) + cw_part = 
cw_part[np.searchsorted(self.classes_[k], + y_org[:, k])] + cw.append(cw_part) + cw = np.prod(cw, axis=0, dtype=np.float64) + else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ @@ -239,6 +260,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): "number of samples=%d" % (len(sample_weight), n_samples)) + if cw is not None: + if sample_weight is not None: + sample_weight *= cw + else: + sample_weight = cw + # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * @@ -427,6 +454,21 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + class_weight : dict, list of dicts, "auto" or None, optional + (default=None) + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + The "auto" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data. + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -496,7 +538,8 @@ def __init__(self, min_weight_fraction_leaf=0., max_features=None, random_state=None, - max_leaf_nodes=None): + max_leaf_nodes=None, + class_weight=None): super(DecisionTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, @@ -506,6 +549,7 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, + class_weight=class_weight, random_state=random_state) def predict_proba(self, X): @@ -749,7 +793,8 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", random_state=None, - max_leaf_nodes=None): + max_leaf_nodes=None, + class_weight=None): super(ExtraTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, @@ -759,6 +804,7 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, + class_weight=class_weight, random_state=random_state) From 2b24ccefa02153ae17648f45bd4142af0f98ce51 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 3 Jan 2015 13:15:42 -0800 Subject: [PATCH 5/8] parameter validation checks & tests for errors --- sklearn/ensemble/forest.py | 65 ++++++++++++++++++--------- sklearn/ensemble/tests/test_forest.py | 30 +++++++++++++ sklearn/tree/tests/test_tree.py | 24 ++++++++++ sklearn/tree/tree.py | 14 ++++++ 4 files changed, 113 insertions(+), 20 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 23afed13e4c5b..2540f1647e930 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -98,10 +98,10 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, cw_part = compute_class_weight('auto', classes_boot, y_boot) # Expand class weights to cover all classes in original y # (in case some were missing from the bootstrap sample) - cw_part 
= np.array([cw_part[np.where(classes_boot == w)][0] - if w in classes_boot + cw_part = np.array([cw_part[np.where(classes_boot == c)][0] + if c in classes_boot else 0. - for w in classes_full]) + for c in classes_full]) # Expand weights over the original y for this output cw_part = cw_part[np.searchsorted(classes_full, y_full)] cw.append(cw_part) @@ -418,23 +418,48 @@ def _validate_y_cw(self, y_org): self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) - if (self.class_weight is not None and - (self.class_weight != 'bootstrap' or not self.bootstrap)): - cw = [] - for k in range(self.n_outputs_): - if self.class_weight in ['auto', 'bootstrap']: - class_weight_k = 'auto' - elif self.n_outputs_ == 1: - class_weight_k = self.class_weight - else: - class_weight_k = self.class_weight[k] - cw_part = compute_class_weight(class_weight_k, - self.classes_[k], - y_org[:, k]) - cw_part = cw_part[np.searchsorted(self.classes_[k], - y_org[:, k])] - cw.append(cw_part) - cw = np.prod(cw, axis=0, dtype=np.float64) + if self.class_weight is not None: + valid_presets = ['auto', 'bootstrap'] + if isinstance(self.class_weight, six.string_types): + if self.class_weight not in valid_presets: + raise ValueError('Valid presets for class_weight include ' + '"auto" and "bootstrap". Given "%s".' + % self.class_weight) + if self.warm_start: + warn('class_weight presets "auto" or "bootstrap" are ' + 'not recommended for warm_start if the fitted data ' + 'differs from the full dataset. In order to use ' + '"auto" weights, use compute_class_weight("auto", ' + 'classes, y). In place of y you can use a large ' + 'enough sample of the full training set target to ' + 'properly estimate the class frequency ' + 'distributions. Pass the resulting weights as the ' + 'class_weight parameter.') + elif self.n_outputs_ > 1: + if not hasattr(self.class_weight, "__iter__"): + raise ValueError("For multi-output, class_weight should " + "be a list of dicts, or a valid string.") + elif len(self.class_weight) != self.n_outputs_: + raise ValueError("For multi-output, number of elements " + "in class_weight should match number of " + "outputs.") + + if self.class_weight != 'bootstrap' or not self.bootstrap: + cw = [] + for k in range(self.n_outputs_): + if self.class_weight in valid_presets: + class_weight_k = 'auto' + elif self.n_outputs_ == 1: + class_weight_k = self.class_weight + else: + class_weight_k = self.class_weight[k] + cw_part = compute_class_weight(class_weight_k, + self.classes_[k], + y_org[:, k]) + cw_part = cw_part[np.searchsorted(self.classes_[k], + y_org[:, k])] + cw.append(cw_part) + cw = np.prod(cw, axis=0, dtype=np.float64) return y, cw diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index ab834821d46fe..6a96b6937d82b 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -811,6 +811,36 @@ def test_class_weight_auto_and_bootstrap_multi_output(): yield check_class_weight_auto_and_bootstrap_multi_output, name +def check_class_weight_errors(name): + """Test if class_weight raises errors and warnings when expected.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + _y = np.vstack((y, np.array(y) * 2)).T + + # Invalid preset string + clf = ForestClassifier(class_weight='the larch', random_state=0) + assert_raises(ValueError, clf.fit, X, y) + assert_raises(ValueError, clf.fit, X, _y) + + # Warning warm_start with preset + clf = ForestClassifier(class_weight='auto', warm_start=True, + random_state=0) + 
assert_warns(UserWarning, clf.fit, X, y) + assert_warns(UserWarning, clf.fit, X, _y) + + # Not a list or preset for multi-output + clf = ForestClassifier(class_weight=1, random_state=0) + assert_raises(ValueError, clf.fit, X, _y) + + # Incorrect length list for multi-output + clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}], random_state=0) + assert_raises(ValueError, clf.fit, X, _y) + + +def test_class_weight_errors(): + for name in FOREST_CLASSIFIERS: + yield check_class_weight_errors, name + + def check_warm_start(name, random_state=42): """Test if fitting incrementally with warm start gives a forest of the right size and the same results as a normal fit.""" diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index bca4fafec8816..c8f68b21bbe58 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -896,6 +896,30 @@ def test_class_weights(): yield check_class_weights, name +def check_class_weight_errors(name): + """Test if class_weight raises errors and warnings when expected.""" + TreeClassifier = CLF_TREES[name] + _y = np.vstack((y, np.array(y) * 2)).T + + # Invalid preset string + clf = TreeClassifier(class_weight='the larch', random_state=0) + assert_raises(ValueError, clf.fit, X, y) + assert_raises(ValueError, clf.fit, X, _y) + + # Not a list or preset for multi-output + clf = TreeClassifier(class_weight=1, random_state=0) + assert_raises(ValueError, clf.fit, X, _y) + + # Incorrect length list for multi-output + clf = TreeClassifier(class_weight=[{-1: 0.5, 1: 1.}], random_state=0) + assert_raises(ValueError, clf.fit, X, _y) + + +def test_class_weight_errors(): + for name in CLF_TREES: + yield check_class_weight_errors, name + + def test_max_leaf_nodes(): """Test greedy trees with max_depth + 1 leafs. """ from sklearn.tree._tree import TREE_LEAF diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 8a50cb66b7da9..f5100f8cc4d37 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -171,6 +171,20 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.n_classes_.append(classes_k.shape[0]) if self.class_weight is not None: + if isinstance(self.class_weight, six.string_types): + if self.class_weight != "auto": + raise ValueError('The only supported preset for ' + 'class_weight is "auto". Given "%s".' + % self.class_weight) + elif self.n_outputs_ > 1: + if not hasattr(self.class_weight, "__iter__"): + raise ValueError('For multi-output, class_weight ' + 'should be a list of dicts, or ' + '"auto".') + elif len(self.class_weight) != self.n_outputs_: + raise ValueError("For multi-output, number of " + "elements in class_weight should " + "match number of outputs.") cw = [] for k in range(self.n_outputs_): if self.n_outputs_ == 1 or self.class_weight == 'auto': From ac9ab961bb55d9e39834a521ac022aeb0157bc9d Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Thu, 8 Jan 2015 02:31:24 -0800 Subject: [PATCH 6/8] Y-org rename & whats_new update --- doc/whats_new.rst | 7 +++++++ sklearn/ensemble/forest.py | 8 ++++---- sklearn/tree/tree.py | 6 +++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 8816f72590a90..c5db9b5c7bbb5 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -136,6 +136,11 @@ Enhancements - Add ``n_iter_`` attribute to estimators that accept a ``max_iter`` attribute in their constructor. By `Manoj Kumar`_. 
+ - Add ``class_weight`` parameter to automatically weight samples by class + frequency for :class:`ensemble.RandomForestClassifier`, + :class:`tree.DecisionTreeClassifier`, :class:`ensemble.ExtraTreesClassifier` + and :class:`tree.ExtraTreeClassifier`. By `Trevor Stephens`_. + Documentation improvements .......................... @@ -3123,3 +3128,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Matt Terry: https://github.com/mrterry .. _Antony Lee: https://www.ocf.berkeley.edu/~antonyl/ + +.. _Trevor Stephens: http://trevorstephens.com/ diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 2540f1647e930..d9c1a2864294e 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -406,8 +406,8 @@ def _set_oob_score(self, X, y): self.oob_score_ = oob_score / self.n_outputs_ - def _validate_y_cw(self, y_org): - y = np.copy(y_org) + def _validate_y_cw(self, y_original): + y = np.copy(y_original) cw = None self.classes_ = [] @@ -455,9 +455,9 @@ def _validate_y_cw(self, y_org): class_weight_k = self.class_weight[k] cw_part = compute_class_weight(class_weight_k, self.classes_[k], - y_org[:, k]) + y_original[:, k]) cw_part = cw_part[np.searchsorted(self.classes_[k], - y_org[:, k])] + y_original[:, k])] cw.append(cw_part) cw = np.prod(cw, axis=0, dtype=np.float64) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index f5100f8cc4d37..9cbae9e76e43c 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -163,7 +163,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.n_classes_ = [] if self.class_weight is not None: - y_org = np.copy(y) + y_original = np.copy(y) for k in range(self.n_outputs_): classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True) @@ -193,9 +193,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): class_weight_k = self.class_weight[k] cw_part = compute_class_weight(class_weight_k, self.classes_[k], - y_org[:, k]) + y_original[:, k]) cw_part = cw_part[np.searchsorted(self.classes_[k], - y_org[:, k])] + y_original[:, k])] cw.append(cw_part) cw = np.prod(cw, axis=0, dtype=np.float64) From cad87b578cf8a91d4d59834452ff632819d60279 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 10 Jan 2015 11:49:42 -0800 Subject: [PATCH 7/8] rename vars & copy sample_weight --- sklearn/ensemble/forest.py | 61 +++++++++++++++++++++----------------- sklearn/tree/tree.py | 26 ++++++++-------- 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 3ff2abca16965..43c9866145b98 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -88,25 +88,27 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, curr_sample_weight *= sample_counts if class_weight == 'bootstrap': - cw = [curr_sample_weight] + expanded_class_weight = [curr_sample_weight] for k in range(y.shape[1]): y_full = y[:, k] classes_full = np.unique(y_full) y_boot = y_full[indices] classes_boot = np.unique(y_boot) # Get class weights for the bootstrap sample - cw_part = compute_class_weight('auto', classes_boot, y_boot) + weight_k = compute_class_weight('auto', classes_boot, y_boot) # Expand class weights to cover all classes in original y # (in case some were missing from the bootstrap sample) - cw_part = np.array([cw_part[np.where(classes_boot == c)][0] - if c in classes_boot - else 0. - for c in classes_full]) + weight_k = np.array([weight_k[np.where(classes_boot == c)][0] + if c in classes_boot + else 0. 
+ for c in classes_full]) # Expand weights over the original y for this output - cw_part = cw_part[np.searchsorted(classes_full, y_full)] - cw.append(cw_part) + weight_k = weight_k[np.searchsorted(classes_full, y_full)] + expanded_class_weight.append(weight_k) # Multiply all weights by sample & bootstrap weights - curr_sample_weight = np.prod(cw, axis=0, dtype=np.float64) + curr_sample_weight = np.prod(expanded_class_weight, + axis=0, + dtype=np.float64) tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) @@ -232,16 +234,16 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] - y, cw = self._validate_y_cw(y) + y, expanded_class_weight = self._validate_y_class_weight(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) - if cw is not None: + if expanded_class_weight is not None: if sample_weight is not None: - sample_weight *= cw + sample_weight = np.copy(sample_weight) * expanded_class_weight else: - sample_weight = cw + sample_weight = expanded_class_weight # Check parameters self._validate_estimator() @@ -306,7 +308,7 @@ def fit(self, X, y, sample_weight=None): def _set_oob_score(self, X, y): """Calculate out of bag predictions and score.""" - def _validate_y_cw(self, y): + def _validate_y_class_weight(self, y): # Default implementation return y, None @@ -406,9 +408,12 @@ def _set_oob_score(self, X, y): self.oob_score_ = oob_score / self.n_outputs_ - def _validate_y_cw(self, y_original): - y = np.copy(y_original) - cw = None + def _validate_y_class_weight(self, y): + y = np.copy(y) + expanded_class_weight = None + + if self.class_weight is not None: + y_original = np.copy(y) self.classes_ = [] self.n_classes_ = [] @@ -445,7 +450,7 @@ def _validate_y_cw(self, y_original): "outputs.") if self.class_weight != 'bootstrap' or not self.bootstrap: - cw = [] + expanded_class_weight = [] for k in range(self.n_outputs_): if self.class_weight in valid_presets: class_weight_k = 'auto' @@ -453,15 +458,17 @@ def _validate_y_cw(self, y_original): class_weight_k = self.class_weight else: class_weight_k = self.class_weight[k] - cw_part = compute_class_weight(class_weight_k, - self.classes_[k], - y_original[:, k]) - cw_part = cw_part[np.searchsorted(self.classes_[k], - y_original[:, k])] - cw.append(cw_part) - cw = np.prod(cw, axis=0, dtype=np.float64) - - return y, cw + weight_k = compute_class_weight(class_weight_k, + self.classes_[k], + y_original[:, k]) + weight_k = weight_k[np.searchsorted(self.classes_[k], + y_original[:, k])] + expanded_class_weight.append(weight_k) + expanded_class_weight = np.prod(expanded_class_weight, + axis=0, + dtype=np.float64) + + return y, expanded_class_weight def predict(self, X): """Predict class for X. 
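The change above from `sample_weight *= cw` to `np.copy(sample_weight) * expanded_class_weight` matters because the in-place multiply wrote into the very array the caller passed to `fit` (the next patch then drops the explicit `np.copy`, since the binary `*` already allocates a fresh array). A toy demonstration of the aliasing this avoids, with illustrative names:

    import numpy as np

    user_weights = np.ones(4)          # array owned by the caller
    cw = np.array([1., 2., 2., 1.])    # expanded class weights

    def fit_inplace(sample_weight):
        sample_weight *= cw            # writes into the caller's array
        return sample_weight

    def fit_copy(sample_weight):
        return sample_weight * cw      # binary * allocates a new array

    fit_copy(user_weights)
    print(user_weights)    # [ 1.  1.  1.  1.]  caller's array untouched

    fit_inplace(user_weights)
    print(user_weights)    # [ 1.  2.  2.  1.]  caller's array mutated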
diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index f90fa582cba68..db53f481f6434 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -147,7 +147,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) - cw = None + expanded_class_weight = None if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs @@ -185,19 +185,21 @@ def fit(self, X, y, sample_weight=None, check_input=True): raise ValueError("For multi-output, number of " "elements in class_weight should " "match number of outputs.") - cw = [] + expanded_class_weight = [] for k in range(self.n_outputs_): if self.n_outputs_ == 1 or self.class_weight == 'auto': class_weight_k = self.class_weight else: class_weight_k = self.class_weight[k] - cw_part = compute_class_weight(class_weight_k, - self.classes_[k], - y_original[:, k]) - cw_part = cw_part[np.searchsorted(self.classes_[k], - y_original[:, k])] - cw.append(cw_part) - cw = np.prod(cw, axis=0, dtype=np.float64) + weight_k = compute_class_weight(class_weight_k, + self.classes_[k], + y_original[:, k]) + weight_k = weight_k[np.searchsorted(self.classes_[k], + y_original[:, k])] + expanded_class_weight.append(weight_k) + expanded_class_weight = np.prod(expanded_class_weight, + axis=0, + dtype=np.float64) else: self.classes_ = [None] * self.n_outputs_ @@ -274,11 +276,11 @@ def fit(self, X, y, sample_weight=None, check_input=True): "number of samples=%d" % (len(sample_weight), n_samples)) - if cw is not None: + if expanded_class_weight is not None: if sample_weight is not None: - sample_weight *= cw + sample_weight = np.copy(sample_weight) * expanded_class_weight else: - sample_weight = cw + sample_weight = expanded_class_weight # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: From 35c2535bfc838db31a14b4da9d0f4bd195b9e252 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 12 Jan 2015 21:27:13 -0800 Subject: [PATCH 8/8] rename cw option to subsample & refactor its implementation --- sklearn/ensemble/forest.py | 41 ++++++++++++++------------- sklearn/ensemble/tests/test_forest.py | 2 +- sklearn/tree/tree.py | 2 +- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index f383964065402..076df001b9899 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -87,24 +87,27 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, sample_counts = np.bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts - if class_weight == 'bootstrap': + if class_weight == 'subsample': + expanded_class_weight = [curr_sample_weight] + for k in range(y.shape[1]): y_full = y[:, k] classes_full = np.unique(y_full) - y_boot = y_full[indices] + y_boot = y[indices, k] classes_boot = np.unique(y_boot) - # Get class weights for the bootstrap sample - weight_k = compute_class_weight('auto', classes_boot, y_boot) - # Expand class weights to cover all classes in original y - # (in case some were missing from the bootstrap sample) - weight_k = np.array([weight_k[np.where(classes_boot == c)][0] - if c in classes_boot - else 0. 
- for c in classes_full]) + + # Get class weights for the bootstrap sample, covering all + # classes in case some were missing from the bootstrap sample + weight_k = np.choose( + np.searchsorted(classes_boot, classes_full), + compute_class_weight('auto', classes_boot, y_boot), + mode='clip') + # Expand weights over the original y for this output weight_k = weight_k[np.searchsorted(classes_full, y_full)] expanded_class_weight.append(weight_k) + # Multiply all weights by sample & bootstrap weights curr_sample_weight = np.prod(expanded_class_weight, axis=0, @@ -243,7 +246,7 @@ def fit(self, X, y, sample_weight=None): if expanded_class_weight is not None: if sample_weight is not None: - sample_weight = np.copy(sample_weight) * expanded_class_weight + sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight @@ -428,14 +431,14 @@ def _validate_y_class_weight(self, y): self.n_classes_.append(classes_k.shape[0]) if self.class_weight is not None: - valid_presets = ['auto', 'bootstrap'] + valid_presets = ('auto', 'subsample') if isinstance(self.class_weight, six.string_types): if self.class_weight not in valid_presets: raise ValueError('Valid presets for class_weight include ' - '"auto" and "bootstrap". Given "%s".' + '"auto" and "subsample". Given "%s".' % self.class_weight) if self.warm_start: - warn('class_weight presets "auto" or "bootstrap" are ' + warn('class_weight presets "auto" or "subsample" are ' 'not recommended for warm_start if the fitted data ' 'differs from the full dataset. In order to use ' '"auto" weights, use compute_class_weight("auto", ' @@ -453,7 +456,7 @@ def _validate_y_class_weight(self, y): "in class_weight should match number of " "outputs.") - if self.class_weight != 'bootstrap' or not self.bootstrap: + if self.class_weight != 'subsample' or not self.bootstrap: expanded_class_weight = [] for k in range(self.n_outputs_): if self.class_weight in valid_presets: @@ -797,7 +800,7 @@ class RandomForestClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. - class_weight : dict, list of dicts, "auto", "bootstrap" or None, optional + class_weight : dict, list of dicts, "auto", "subsample" or None, optional Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For @@ -807,7 +810,7 @@ class RandomForestClassifier(ForestClassifier): The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. - The "bootstrap" mode is the same as "auto" except that weights are + The "subsample" mode is the same as "auto" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. @@ -1127,7 +1130,7 @@ class ExtraTreesClassifier(ForestClassifier): and add more estimators to the ensemble, otherwise, just fit a whole new forest. - class_weight : dict, list of dicts, "auto", "bootstrap" or None, optional + class_weight : dict, list of dicts, "auto", "subsample" or None, optional Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For @@ -1137,7 +1140,7 @@ class ExtraTreesClassifier(ForestClassifier): The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. 
- The "bootstrap" mode is the same as "auto" except that weights are + The "subsample" mode is the same as "auto" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 6a96b6937d82b..cd0697af20500 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -802,7 +802,7 @@ def check_class_weight_auto_and_bootstrap_multi_output(name): clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}, {-2: 1., 2: 1.}], random_state=0) clf.fit(X, _y) - clf = ForestClassifier(class_weight='bootstrap', random_state=0) + clf = ForestClassifier(class_weight='subsample', random_state=0) clf.fit(X, _y) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 64989a356a75f..6dd0cf2999bcc 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -279,7 +279,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): if expanded_class_weight is not None: if sample_weight is not None: - sample_weight = np.copy(sample_weight) * expanded_class_weight + sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight
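The `np.choose(np.searchsorted(classes_boot, classes_full), ..., mode='clip')` refactor above replaces the earlier list comprehension that mapped weights computed on the bootstrap sample back onto the full class set. One subtlety: a class absent from the bootstrap sample now receives a clipped neighbour's weight rather than the 0. the old code assigned, but the end result is identical, because any sample of such a class necessarily has a bootstrap count of zero in `curr_sample_weight`, so its final product is zero either way. A minimal sketch of the index-and-clip step, with toy arrays:

    import numpy as np

    classes_full = np.array([0, 1, 2])
    classes_boot = np.array([0, 1])   # class 2 missing from the bootstrap
    cw_boot = np.array([0.75, 1.5])   # weights for the bootstrap classes

    idx = np.searchsorted(classes_boot, classes_full)   # [0, 1, 2]
    weight_k = np.choose(idx, cw_boot, mode='clip')     # 2 clipped to 1
    print(weight_k)   # [ 0.75  1.5   1.5 ]  class 2 gets a stand-in weight

    # Harmless: samples of class 2 were never drawn, so their count from
    # np.bincount(indices) is 0 and zeroes out curr_sample_weight anyway.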