scikit-learn
diff --git a/‎.landscape.yml
Lines changed: 0 additions & 5 deletions b/‎.landscape.yml
Lines changed: 0 additions & 5 deletions
diff --git a/‎azure-pipelines.yml
Lines changed: 6 additions & 0 deletions b/‎azure-pipelines.yml
Lines changed: 6 additions & 0 deletions
diff --git a/‎benchmarks/bench_hist_gradient_boosting.py
Lines changed: 26 additions & 7 deletions b/‎benchmarks/bench_hist_gradient_boosting.py
Lines changed: 26 additions & 7 deletions
diff --git a/‎build_tools/azure/install.cmd
Lines changed: 3 additions & 1 deletion b/‎build_tools/azure/install.cmd
Lines changed: 3 additions & 1 deletion
diff --git a/‎build_tools/azure/install.sh
Lines changed: 4 additions & 2 deletions b/‎build_tools/azure/install.sh
Lines changed: 4 additions & 2 deletions
diff --git a/‎build_tools/azure/posix-32.yml
Lines changed: 1 addition & 0 deletions b/‎build_tools/azure/posix-32.yml
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/developers/develop.rst
Lines changed: 19 additions & 14 deletions b/‎doc/developers/develop.rst
Lines changed: 19 additions & 14 deletions
diff --git a/‎doc/developers/tips.rst
Lines changed: 2 additions & 0 deletions b/‎doc/developers/tips.rst
Lines changed: 2 additions & 0 deletions
diff --git a/‎doc/modules/clustering.rst
Lines changed: 8 additions & 14 deletions b/‎doc/modules/clustering.rst
Lines changed: 8 additions & 14 deletions
diff --git a/‎doc/modules/ensemble.rst
Lines changed: 34 additions & 2 deletions b/‎doc/modules/ensemble.rst
Lines changed: 34 additions & 2 deletions
@@ -39,6 +39,7 @@ jobs:
         PILLOW_VERSION: '*'
         PYTEST_VERSION: '*'
         JOBLIB_VERSION: '*'
+        THREADPOOLCTL_VERSION: '2.0.0'
         COVERAGE: 'true'
 
 - template: build_tools/azure/posix.yml
@@ -54,6 +55,7 @@ jobs:
         DISTRIB: 'ubuntu'
         PYTHON_VERSION: '3.6'
         JOBLIB_VERSION: '0.11'
+        THREADPOOLCTL_VERSION: '2.0.0'
       # Linux + Python 3.6 build with OpenBLAS and without SITE_JOBLIB
       py36_conda_openblas:
         DISTRIB: 'conda'
@@ -70,6 +72,7 @@ jobs:
         SCIKIT_IMAGE_VERSION: '*'
         # latest version of joblib available in conda for Python 3.6
         JOBLIB_VERSION: '0.13.2'
+        THREADPOOLCTL_VERSION: '2.0.0'
         COVERAGE: 'true'
       # Linux environment to test the latest available dependencies and MKL.
       # It runs tests requiring lightgbm, pandas and PyAMG.
@@ -92,6 +95,7 @@ jobs:
         DISTRIB: 'ubuntu-32'
         PYTHON_VERSION: '3.6'
         JOBLIB_VERSION: '0.13'
+        THREADPOOLCTL_VERSION: '2.0.0'
 
 - template: build_tools/azure/posix.yml
   parameters:
@@ -109,6 +113,7 @@ jobs:
         PILLOW_VERSION: '*'
         PYTEST_VERSION: '*'
         JOBLIB_VERSION: '*'
+        THREADPOOLCTL_VERSION: '2.0.0'
         COVERAGE: 'true'
       pylatest_conda_mkl_no_openmp:
         DISTRIB: 'conda'
@@ -120,6 +125,7 @@ jobs:
         PILLOW_VERSION: '*'
         PYTEST_VERSION: '*'
         JOBLIB_VERSION: '*'
+        THREADPOOLCTL_VERSION: '2.0.0'
         COVERAGE: 'true'
         SKLEARN_TEST_NO_OPENMP: 'true'
         SKLEARN_SKIP_OPENMP_TEST: 'true'
 
@@ -32,6 +32,9 @@
 parser.add_argument('--n-samples-max', type=int, default=int(1e6))
 parser.add_argument('--n-features', type=int, default=20)
 parser.add_argument('--max-bins', type=int, default=255)
+parser.add_argument('--random-sample-weights', action="store_true",
+                    default=False,
+                    help="generate and use random sample weights")
 args = parser.parse_args()
 
 n_leaf_nodes = args.n_leaf_nodes
@@ -46,6 +49,7 @@ def get_estimator_and_data():
                                    n_features=args.n_features,
                                    n_classes=args.n_classes,
                                    n_clusters_per_class=1,
+                                   n_informative=args.n_classes,
                                    random_state=0)
         return X, y, HistGradientBoostingClassifier
     elif args.problem == 'regression':
@@ -60,15 +64,30 @@ def get_estimator_and_data():
         np.bool)
     X[mask] = np.nan
 
-X_train_, X_test_, y_train_, y_test_ = train_test_split(
-    X, y, test_size=0.5, random_state=0)
+if args.random_sample_weights:
+    sample_weight = np.random.rand(len(X)) * 10
+else:
+    sample_weight = None
+
+if sample_weight is not None:
+    (X_train_, X_test_, y_train_, y_test_,
+     sample_weight_train_, _) = train_test_split(
+        X, y, sample_weight, test_size=0.5, random_state=0)
+else:
+    X_train_, X_test_, y_train_, y_test_ = train_test_split(
+        X, y, test_size=0.5, random_state=0)
+    sample_weight_train_ = None
 
 
 def one_run(n_samples):
     X_train = X_train_[:n_samples]
     X_test = X_test_[:n_samples]
     y_train = y_train_[:n_samples]
     y_test = y_test_[:n_samples]
+    if sample_weight is not None:
+        sample_weight_train = sample_weight_train_[:n_samples]
+    else:
+        sample_weight_train = None
     assert X_train.shape[0] == n_samples
     assert X_test.shape[0] == n_samples
     print("Data size: %d samples train, %d samples test."
@@ -79,7 +98,7 @@ def one_run(n_samples):
                     max_iter=n_trees,
                     max_bins=max_bins,
                     max_leaf_nodes=n_leaf_nodes,
-                    n_iter_no_change=None,
+                    early_stopping=False,
                     random_state=0,
                     verbose=0)
     loss = args.loss
@@ -93,7 +112,7 @@ def one_run(n_samples):
         if loss == 'default':
             loss = 'least_squares'
     est.set_params(loss=loss)
-    est.fit(X_train, y_train)
+    est.fit(X_train, y_train, sample_weight=sample_weight_train)
     sklearn_fit_duration = time() - tic
     tic = time()
     sklearn_score = est.score(X_test, y_test)
@@ -110,7 +129,7 @@ def one_run(n_samples):
         lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
 
         tic = time()
-        lightgbm_est.fit(X_train, y_train)
+        lightgbm_est.fit(X_train, y_train, sample_weight=sample_weight_train)
         lightgbm_fit_duration = time() - tic
         tic = time()
         lightgbm_score = lightgbm_est.score(X_test, y_test)
@@ -127,7 +146,7 @@ def one_run(n_samples):
         xgb_est = get_equivalent_estimator(est, lib='xgboost')
 
         tic = time()
-        xgb_est.fit(X_train, y_train)
+        xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train)
         xgb_fit_duration = time() - tic
         tic = time()
         xgb_score = xgb_est.score(X_test, y_test)
@@ -144,7 +163,7 @@ def one_run(n_samples):
         cat_est = get_equivalent_estimator(est, lib='catboost')
 
         tic = time()
-        cat_est.fit(X_train, y_train)
+        cat_est.fit(X_train, y_train, sample_weight=sample_weight_train)
         cat_fit_duration = time() - tic
         tic = time()
         cat_score = cat_est.score(X_test, y_test)
 
@@ -15,14 +15,16 @@ IF "%PYTHON_ARCH%"=="64" (
 
     call activate %VIRTUALENV%
 
+    pip install threadpoolctl
+
     IF "%PYTEST_VERSION%"=="*" (
         pip install pytest
     ) else (
         pip install pytest==%PYTEST_VERSION%
     )
     pip install pytest-xdist
 ) else (
-    pip install numpy scipy cython pytest wheel pillow joblib
+    pip install numpy scipy cython pytest wheel pillow joblib threadpoolctl
 )
 if "%COVERAGE%" == "true" (
     pip install coverage codecov pytest-cov
 
@@ -65,6 +65,8 @@ if [[ "$DISTRIB" == "conda" ]]; then
 
 	make_conda $TO_INSTALL
 
+    pip install threadpoolctl==$THREADPOOLCTL_VERSION
+
     if [[ "$PYTEST_VERSION" == "*" ]]; then
         python -m pip install pytest
     else
@@ -81,13 +83,13 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then
     sudo apt-get install python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv
     python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV
     source $VIRTUALENV/bin/activate
-    python -m pip install pytest==$PYTEST_VERSION pytest-cov cython joblib==$JOBLIB_VERSION
+    python -m pip install pytest==$PYTEST_VERSION pytest-cov cython joblib==$JOBLIB_VERSION threadpoolctl==$THREADPOOLCTL_VERSION
 elif [[ "$DISTRIB" == "ubuntu-32" ]]; then
     apt-get update
     apt-get install -y python3-dev python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv
     python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV
     source $VIRTUALENV/bin/activate
-    python -m pip install pytest==$PYTEST_VERSION pytest-cov cython joblib==$JOBLIB_VERSION
+    python -m pip install pytest==$PYTEST_VERSION pytest-cov cython joblib==$JOBLIB_VERSION threadpoolctl==$THREADPOOLCTL_VERSION
 elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
     # Since conda main channel usually lags behind on the latest releases,
     # we use pypi to test against the latest releases of the dependencies.
 
@@ -36,6 +36,7 @@ jobs:
         -e JUNITXML=$JUNITXML
         -e VIRTUALENV=testvenv
         -e JOBLIB_VERSION=$JOBLIB_VERSION
+        -e THREADPOOLCTL_VERSION=$THREADPOOLCTL_VERSION
         -e PYTEST_VERSION=$PYTEST_VERSION
         -e OMP_NUM_THREADS=$OMP_NUM_THREADS
         -e OPENBLAS_NUM_THREADS=$OPENBLAS_NUM_THREADS
 
@@ -481,58 +481,63 @@ runtime. The default values for the estimator tags are defined in the
 
 The current set of estimator tags are:
 
-allow_nan (default=``False``)
+allow_nan (default=False)
     whether the estimator supports data with missing values encoded as np.NaN
 
-binary_only (default=``False``)
+binary_only (default=False)
     whether estimator supports binary classification but lacks multi-class
     classification support.
 
-multilabel (default=``False``)
+multilabel (default=False)
     whether the estimator supports multilabel output
 
-multioutput (default=``False``)
+multioutput (default=False)
     whether a regressor supports multi-target outputs or a classifier supports
     multi-class multi-output.
 
-multioutput_only (default=``False``)
+multioutput_only (default=False)
     whether estimator supports only multi-output classification or regression.
 
-no_validation (default=``False``)
+no_validation (default=False)
     whether the estimator skips input-validation. This is only meant for
     stateless and dummy transformers!
 
-non_deterministic (default=``False``)
+non_deterministic (default=False)
     whether the estimator is not deterministic given a fixed ``random_state``
 
-poor_score (default=``False``)
+poor_score (default=False)
     whether the estimator fails to provide a "reasonable" test-set score, which
     currently for regression is an R2 of 0.5 on a subset of the boston housing
     dataset, and for classification an accuracy of 0.83 on
     ``make_blobs(n_samples=300, random_state=0)``. These datasets and values
     are based on current estimators in sklearn and might be replaced by
     something more systematic.
 
-requires_fit (default=``True``)
+requires_fit (default=True)
     whether the estimator needs to be fitted before calling one of
     `transform`, `predict`, `predict_proba`, or `decision_function`.
 
-requires_positive_X (default=``False``)
+requires_positive_X (default=False)
     whether the estimator requires positive X.
 
-requires_positive_y (default=``False``)
+requires_positive_y (default=False)
     whether the estimator requires a positive y (only applicable for regression).
 
-_skip_test (default=``False``)
+_skip_test (default=False)
     whether to skip common tests entirely. Don't use this unless you have a
     *very good* reason.
 
-stateless (default=``False``)
+_xfail_test (default=False)
+    dictionary ``{check_name : reason}`` of common checks to mark as a
+    known failure, with the associated reason. Don't use this unless you have a
+    *very good* reason.
+
+stateless (default=False)
     whether the estimator needs access to data for fitting. Even though an
     estimator is stateless, it might still need a call to ``fit`` for
     initialization.
 
-X_types (default=``['2darray']``)
+X_types (default=['2darray'])
     Supported input types for X as list of strings. Tests are currently only
     run if '2darray' is contained in the list, signifying that the estimator
     takes continuous 2d numpy arrays as input. The default value is
 
@@ -86,6 +86,8 @@ Other `pytest` options that may become useful include:
   - ``-s`` so that pytest does not capture the output of ``print()``
     statements
   - ``--tb=short`` or ``--tb=line`` to control the length of the logs
+  - ``--runxfail`` to also run tests marked as a known failure (XFAIL) and
+    report their errors.
 
 Since our continuous integration tests will error if
 ``FutureWarning`` isn't properly caught,
 
@@ -205,23 +205,17 @@ computing cluster centers and values of inertia. For example, assigning a
 weight of 2 to a sample is equivalent to adding a duplicate of that sample
 to the dataset :math:`X`.
 
-A parameter can be given to allow K-means to be run in parallel, called
-``n_jobs``. Giving this parameter a positive value uses that many processors
-(default: 1). A value of -1 uses all available processors, with -2 using one
-less, and so on. Parallelization generally speeds up computation at the cost of
-memory (in this case, multiple copies of centroids need to be stored, one for
-each job).
-
-.. warning::
-
-    The parallel version of K-Means is broken on OS X when `numpy` uses the
-    `Accelerate` Framework. This is expected behavior: `Accelerate` can be called
-    after a fork but you need to execv the subprocess with the Python binary
-    (which multiprocessing does not do under posix).
-
 K-means can be used for vector quantization. This is achieved using the
 transform method of a trained model of :class:`KMeans`.
 
+Low-level parallelism
+---------------------
+
+:class:`KMeans` benefits from OpenMP-based parallelism through Cython. Small
+chunks of data (256 samples) are processed in parallel, which in addition
+yields a low memory footprint. For more details on how to control the number of
+threads, please refer to our :ref:`parallelism` notes.
+
 .. topic:: Examples:
 
  * :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`: Demonstrating when
 
@@ -856,8 +856,7 @@ leverage integer-based data structures (histograms) instead of relying on
 sorted continuous values when building the trees. The API of these
 estimators is slightly different, and some of the features from
 :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`
-are not yet supported: in particular sample weights, and some loss
-functions.
+are not yet supported, for instance some loss functions.
 
 These estimators are still **experimental**: their predictions
 and their API might change without any deprecation cycle. To use them, you
@@ -957,6 +956,39 @@ If no missing values were encountered for a given feature during training,
 then samples with missing values are mapped to whichever child has the most
 samples.
 
+Sample weight support
+---------------------
+
+:class:`HistGradientBoostingClassifier` and
+:class:`HistGradientBoostingRegressor` support sample weights during
+:term:`fit`.
+
+The following toy example demonstrates how the model ignores the samples with
+zero sample weights:
+
+    >>> X = [[1, 0],
+    ...      [1, 0],
+    ...      [1, 0],
+    ...      [0, 1]]
+    >>> y = [0, 0, 1, 0]
+    >>> # ignore the first 2 training samples by setting their weight to 0
+    >>> sample_weight = [0, 0, 1, 1]
+    >>> gb = HistGradientBoostingClassifier(min_samples_leaf=1)
+    >>> gb.fit(X, y, sample_weight=sample_weight)
+    HistGradientBoostingClassifier(...)
+    >>> gb.predict([[1, 0]])
+    array([1])
+    >>> gb.predict_proba([[1, 0]])[0, 1]
+    0.99...
+
+As you can see, the sample `[1, 0]` is comfortably classified as `1` since the
+first two samples are ignored due to their zero sample weights.
+
+Implementation detail: taking sample weights into account amounts to
+multiplying the gradients (and the hessians) by the sample weights. Note that
+the binning stage (specifically the quantiles computation) does not take the
+weights into account.
+
 Low-level parallelism
 ---------------------