Commit dce919f

Merge branch 'main' into pairwise-distances-argkmin-plug
2 parents 3448b01 + 49043fc

28 files changed: +349 -61 lines

build_tools/azure/install.sh

Lines changed: 2 additions & 2 deletions
@@ -23,10 +23,10 @@ make_conda() {
 }
 
 setup_ccache() {
-    echo "Setting up ccache"
+    echo "Setting up ccache with CCACHE_DIR=${CCACHE_DIR}"
     mkdir /tmp/ccache/
     which ccache
-    for name in gcc g++ cc c++ x86_64-linux-gnu-gcc x86_64-linux-gnu-c++; do
+    for name in gcc g++ cc c++ clang clang++ i686-linux-gnu-gcc i686-linux-gnu-c++ x86_64-linux-gnu-gcc x86_64-linux-gnu-c++ x86_64-apple-darwin13.4.0-clang x86_64-apple-darwin13.4.0-clang++; do
         ln -s $(which ccache) "/tmp/ccache/${name}"
     done
     export PATH="/tmp/ccache/:${PATH}"

build_tools/azure/posix-docker.yml

Lines changed: 15 additions & 0 deletions
@@ -36,18 +36,31 @@ jobs:
     DISTRIB: ''
     DOCKER_CONTAINER: ''
     SHOW_SHORT_SUMMARY: 'false'
+    CCACHE_DIR: $(Pipeline.Workspace)/ccache
+    CCACHE_COMPRESS: '1'
   strategy:
     matrix:
       ${{ insert }}: ${{ parameters.matrix }}
 
   steps:
+  - task: Cache@2
+    inputs:
+      key: '"ccache-v1" | "$(Agent.JobName)" | "$(Build.BuildNumber)"'
+      restoreKeys: |
+        "ccache-v1" | "$(Agent.JobName)"
+      path: $(CCACHE_DIR)
+    displayName: ccache
+    continueOnError: true
+  - script: >
+      mkdir -p $CCACHE_DIR
   # Container is detached and sleeping, allowing steps to run commands
   # in the container. The TEST_DIR is mapped allowing the host to access
   # the JUNITXML file
   - script: >
       docker container run --rm
       --volume $TEST_DIR:/temp_dir
       --volume $PWD:/io
+      --volume $CCACHE_DIR:/ccache
       -w /io
       --detach
       --name skcontainer
@@ -71,6 +84,8 @@ jobs:
       -e SKLEARN_SKIP_NETWORK_TESTS=$SKLEARN_SKIP_NETWORK_TESTS
       -e BLAS=$BLAS
       -e CPU_COUNT=$CPU_COUNT
+      -e CCACHE_DIR=/ccache
+      -e CCACHE_COMPRESS=$CCACHE_COMPRESS
       $DOCKER_CONTAINER
       sleep 1000000
     displayName: 'Start container'

build_tools/azure/posix.yml

Lines changed: 3 additions & 1 deletion
@@ -50,7 +50,9 @@ jobs:
     condition: startsWith(variables['DISTRIB'], 'conda')
   - task: Cache@2
     inputs:
-      key: '"$(Agent.JobName)"'
+      key: '"ccache-v1" | "$(Agent.JobName)" | "$(Build.BuildNumber)"'
+      restoreKeys: |
+        "ccache-v1" | "$(Agent.JobName)"
       path: $(CCACHE_DIR)
     displayName: ccache
     continueOnError: true

doc/glossary.rst

Lines changed: 1 addition & 0 deletions
@@ -1604,6 +1604,7 @@ functions or non-estimator constructors.
         number of different distinct random seeds. Popular integer
         random seeds are 0 and `42
         <https://en.wikipedia.org/wiki/Answer_to_the_Ultimate_Question_of_Life%2C_the_Universe%2C_and_Everything>`_.
+        Integer values must be in the range `[0, 2**32 - 1]`.
 
     A :class:`numpy.random.RandomState` instance
         Use the provided random state, only affecting other users
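For illustration (not part of the diff), a minimal sketch of the documented seed range, assuming NumPy's legacy `RandomState` seeding:

import numpy as np

# Integer seeds must lie in [0, 2**32 - 1], as the glossary entry now states.
rng = np.random.RandomState(2**32 - 1)  # largest valid integer seed
print(rng.randint(10))
# np.random.RandomState(2**32) raises ValueError because the seed is out of range.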

doc/whats_new/v1.1.rst

Lines changed: 33 additions & 0 deletions
@@ -273,6 +273,13 @@ Changelog
   F-statistic).
   :pr:`17819` by :user:`Juan Carlos Alfaro Jiménez <alfaro96>`.
 
+:mod:`sklearn.gaussian_process`
+...............................
+
+- |Fix| :class:`gaussian_process.GaussianProcessClassifier` raises
+  a more informative error if `CompoundKernel` is passed via `kernel`.
+  :pr:`22223` by :user:`MarcoM <marcozzxx810>`.
+
 :mod:`sklearn.impute`
 .....................
 
@@ -322,6 +329,9 @@ Changelog
 - |Enhancement| :class:`linear_model.QuantileRegressor` support sparse input
   for the highs based solvers.
   :pr:`21086` by :user:`Venkatachalam Natchiappan <venkyyuvy>`.
+  In addition, those solvers now use the CSC matrix right from the
+  beginning, which speeds up fitting.
+  :pr:`22206` by :user:`Christian Lorentzen <lorentzenchr>`.
 
 - |Enhancement| Rename parameter `base_estimator` to `estimator` in
   :class:`linear_model.RANSACRegressor` to improve readability and consistency.
@@ -334,6 +344,11 @@ Changelog
   :pr:`21481` by :user:`Guillaume Lemaitre <glemaitre>` and
   :user:`Andrés Babino <ababino>`.
 
+- |Enhancement| :func:`linear_model.ElasticNet` and
+  other linear model classes using coordinate descent show error
+  messages when non-finite parameter weights are produced. :pr:`22148`
+  by :user:`Christian Ritter <chritter>` and :user:`Norbert Preining <norbusan>`.
+
 - |Fix| :class:`linear_model.ElasticNetCV` now produces correct
   warning when `l1_ratio=0`.
   :pr:`21724` by :user:`Yar Khine Phyo <yarkhinephyo>`.
@@ -359,6 +374,11 @@ Changelog
   A deprecation cycle was introduced.
   :pr:`21576` by :user:`Paul-Emile Dugnat <pedugnat>`.
 
+- |API| The `"wminkowski"` metric of :class:`sklearn.metrics.DistanceMetric` is deprecated
+  and will be removed in version 1.3. Instead, the existing `"minkowski"` metric now takes
+  an optional `w` parameter for weights. This deprecation aims at remaining consistent
+  with the SciPy 1.8 convention. :pr:`21873` by :user:`Yar Khine Phyo <yarkhinephyo>`.
+
 - |Fix| :func:`metrics.silhouette_score` now supports integer input for precomputed
   distances. :pr:`22108` by `Thomas Fan`_.
 
@@ -382,6 +402,11 @@ Changelog
   splits failed. Similarly raise an error during grid-search when the fits for
   all the models and all the splits failed. :pr:`21026` by :user:`Loïc Estève <lesteve>`.
 
+- |Enhancement| It is now possible to pass `scoring="matthews_corrcoef"` to all
+  model selection tools with a `scoring` argument to use the Matthews
+  correlation coefficient (MCC). :pr:`22203` by :user:`Olivier Grisel
+  <ogrisel>`.
+
 - |Fix| :class:`model_selection.GridSearchCV`,
   :class:`model_selection.HalvingGridSearchCV`
   now validate input parameters in `fit` instead of `__init__`.
@@ -412,6 +437,14 @@ Changelog
   instead of `__init__`. :pr:`21430` by :user:`Desislava Vasileva <DessyVV>` and
   :user:`Lucy Jimenez <LucyJimenez>`.
 
+:mod:`sklearn.neural_network`
+.............................
+
+- |Enhancement| :func:`neural_network.MLPClassifier` and
+  :func:`neural_network.MLPRegressor` show error
+  messages when optimizers produce non-finite parameter weights. :pr:`22150`
+  by :user:`Christian Ritter <chritter>` and :user:`Norbert Preining <norbusan>`.
+
 :mod:`sklearn.pipeline`
 .......................
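As a usage sketch for the `scoring="matthews_corrcoef"` entry above (assuming scikit-learn 1.1, where this scorer name is registered):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=200, random_state=0)

# Any model selection tool with a `scoring` argument accepts the new name.
scores = cross_val_score(
    LogisticRegression(), X, y, scoring="matthews_corrcoef", cv=5
)
print(scores.mean())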

examples/applications/plot_model_complexity_influence.py

Lines changed: 2 additions & 1 deletion
@@ -165,7 +165,8 @@ def _count_nonzero_coefficients(estimator):
         "alpha": 0.001,
         "loss": "modified_huber",
         "fit_intercept": True,
-        "tol": 1e-3,
+        "tol": 1e-1,
+        "n_iter_no_change": 2,
     },
     "changing_param": "l1_ratio",
     "changing_param_values": [0.25, 0.5, 0.75, 0.9],

sklearn/cluster/_optics.py

Lines changed: 2 additions & 2 deletions
@@ -673,13 +673,13 @@ def cluster_optics_xi(
     Parameters
     ----------
     reachability : ndarray of shape (n_samples,)
-        Reachability distances calculated by OPTICS (`reachability_`)
+        Reachability distances calculated by OPTICS (`reachability_`).
 
     predecessor : ndarray of shape (n_samples,)
         Predecessors calculated by OPTICS.
 
     ordering : ndarray of shape (n_samples,)
-        OPTICS ordered point indices (`ordering_`)
+        OPTICS ordered point indices (`ordering_`).
 
     min_samples : int > 1 or float between 0 and 1
         The same as the min_samples given to OPTICS. Up and down steep regions

sklearn/cluster/tests/test_hierarchical.py

Lines changed: 2 additions & 0 deletions
@@ -410,6 +410,8 @@ def test_vector_scikit_single_vs_scipy_single(seed):
     assess_same_labelling(cut, cut_scipy)
 
 
+# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
+@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
 def test_mst_linkage_core_memory_mapped(metric_param_grid):
     """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.

sklearn/discriminant_analysis.py

Lines changed: 4 additions & 2 deletions
@@ -631,8 +631,10 @@ def transform(self, X):
 
         Returns
         -------
-        X_new : ndarray of shape (n_samples, n_components)
-            Transformed data.
+        X_new : ndarray of shape (n_samples, n_components) or \
+            (n_samples, min(rank, n_components))
+            Transformed data. In the case of the 'svd' solver, the shape
+            is (n_samples, min(rank, n_components)).
         """
         if self.solver == "lsqr":
             raise NotImplementedError(
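A small sketch of the clarified return shape (illustrative only, based on the docstring above; the dataset and sizes are arbitrary):

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.RandomState(0)
X = rng.randn(30, 5)
y = rng.randint(0, 3, size=30)  # 3 classes => at most 2 discriminant axes

lda = LinearDiscriminantAnalysis(solver="svd", n_components=2).fit(X, y)
print(lda.transform(X).shape)  # (30, 2): (n_samples, min(rank, n_components))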

sklearn/ensemble/_voting.py

Lines changed: 27 additions & 5 deletions
@@ -157,8 +157,8 @@ class VotingClassifier(ClassifierMixin, _BaseVoting):
     estimators : list of (str, estimator) tuples
         Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
         of those original estimators that will be stored in the class attribute
-        ``self.estimators_``. An estimator can be set to ``'drop'``
-        using ``set_params``.
+        ``self.estimators_``. An estimator can be set to ``'drop'`` using
+        :meth:`set_params`.
 
         .. versionchanged:: 0.21
             ``'drop'`` is accepted. Using None was deprecated in 0.22 and
@@ -254,6 +254,18 @@ class VotingClassifier(ClassifierMixin, _BaseVoting):
     >>> eclf2 = eclf2.fit(X, y)
     >>> print(eclf2.predict(X))
     [1 1 1 2 2 2]
+
+    To drop an estimator, :meth:`set_params` can be used to remove it. Here we
+    dropped one of the estimators, resulting in 2 fitted estimators:
+
+    >>> eclf2 = eclf2.set_params(lr='drop')
+    >>> eclf2 = eclf2.fit(X, y)
+    >>> len(eclf2.estimators_)
+    2
+
+    Setting `flatten_transform=True` with `voting='soft'` flattens output shape of
+    `transform`:
+
     >>> eclf3 = VotingClassifier(estimators=[
     ...    ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
     ...    voting='soft', weights=[2,1,1],
@@ -434,7 +446,7 @@ class VotingRegressor(RegressorMixin, _BaseVoting):
         Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
         of those original estimators that will be stored in the class attribute
         ``self.estimators_``. An estimator can be set to ``'drop'`` using
-        ``set_params``.
+        :meth:`set_params`.
 
         .. versionchanged:: 0.21
             ``'drop'`` is accepted. Using None was deprecated in 0.22 and
@@ -488,13 +500,23 @@ class VotingRegressor(RegressorMixin, _BaseVoting):
     >>> from sklearn.linear_model import LinearRegression
     >>> from sklearn.ensemble import RandomForestRegressor
     >>> from sklearn.ensemble import VotingRegressor
+    >>> from sklearn.neighbors import KNeighborsRegressor
     >>> r1 = LinearRegression()
     >>> r2 = RandomForestRegressor(n_estimators=10, random_state=1)
+    >>> r3 = KNeighborsRegressor()
     >>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])
     >>> y = np.array([2, 6, 12, 20, 30, 42])
-    >>> er = VotingRegressor([('lr', r1), ('rf', r2)])
+    >>> er = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)])
     >>> print(er.fit(X, y).predict(X))
-    [ 3.3  5.7 11.8 19.7 28.  40.3]
+    [ 6.8...  8.4... 12.5... 17.8... 26...  34...]
+
+    In the following example, we drop the `'lr'` estimator with
+    :meth:`~VotingRegressor.set_params` and fit the remaining two estimators:
+
+    >>> er = er.set_params(lr='drop')
+    >>> er = er.fit(X, y)
+    >>> len(er.estimators_)
+    2
     """
 
     def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False):

sklearn/gaussian_process/_gpc.py

Lines changed: 5 additions & 1 deletion
@@ -503,7 +503,8 @@ class GaussianProcessClassifier(ClassifierMixin, BaseEstimator):
     kernel : kernel instance, default=None
         The kernel specifying the covariance function of the GP. If None is
         passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
-        the kernel's hyperparameters are optimized during fitting.
+        the kernel's hyperparameters are optimized during fitting. Also kernel
+        cannot be a `CompoundKernel`.
 
     optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'
         Can either be one of the internally supported optimizers for optimizing
@@ -673,6 +674,9 @@ def fit(self, X, y):
         self : object
             Returns an instance of self.
         """
+        if isinstance(self.kernel, CompoundKernel):
+            raise ValueError("kernel cannot be a CompoundKernel")
+
         if self.kernel is None or self.kernel.requires_vector_input:
             X, y = self._validate_data(
                 X, y, multi_output=False, ensure_2d=True, dtype="numeric"

sklearn/gaussian_process/tests/test_gpc.py

Lines changed: 23 additions & 1 deletion
@@ -11,7 +11,12 @@
 import pytest
 
 from sklearn.gaussian_process import GaussianProcessClassifier
-from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
+from sklearn.gaussian_process.kernels import (
+    RBF,
+    CompoundKernel,
+    ConstantKernel as C,
+    WhiteKernel,
+)
 from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel
 from sklearn.exceptions import ConvergenceWarning
 
@@ -260,3 +265,20 @@ def test_warning_bounds():
         "Increasing the bound and calling "
         "fit again may find a better value."
     )
+
+
+@pytest.mark.parametrize(
+    "params, error_type, err_msg",
+    [
+        (
+            {"kernel": CompoundKernel(0)},
+            ValueError,
+            "kernel cannot be a CompoundKernel",
+        )
+    ],
+)
+def test_gpc_fit_error(params, error_type, err_msg):
+    """Check that expected errors are raised during fit."""
+    gpc = GaussianProcessClassifier(**params)
+    with pytest.raises(error_type, match=err_msg):
+        gpc.fit(X, y)

sklearn/linear_model/_coordinate_descent.py

Lines changed: 8 additions & 0 deletions
@@ -1075,6 +1075,14 @@ def fit(self, X, y, sample_weight=None, check_input=True):
             # workaround since _set_intercept will cast self.coef_ into X.dtype
             self.coef_ = np.asarray(self.coef_, dtype=X.dtype)
 
+        # check for finiteness of coefficients
+        if not all(np.isfinite(w).all() for w in [self.coef_, self.intercept_]):
+            raise ValueError(
+                "Coordinate descent iterations resulted in non-finite parameter"
+                " values. The input data may contain large values and need to"
+                " be preprocessed."
+            )
+
         # return self for chaining fit and predict calls
         return self
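The guard added above can be read in isolation as follows; `has_finite_params` is a hypothetical helper for illustration, not part of the library:

import numpy as np

def has_finite_params(coef, intercept):
    # Mirrors the new check: every fitted parameter must be finite.
    return all(np.isfinite(w).all() for w in [coef, intercept])

print(has_finite_params(np.array([0.5, -1.2]), np.array(0.1)))    # True
print(has_finite_params(np.array([np.inf, 0.0]), np.array(0.1)))  # False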

sklearn/linear_model/_quantile.py

Lines changed: 15 additions & 12 deletions
@@ -47,7 +47,7 @@ class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator):
         programming formulation. Note that the highs methods are recommended
         for usage with `scipy>=1.6.0` because they are the fastest ones.
         Solvers "highs-ds", "highs-ipm" and "highs" support
-        sparse input data.
+        sparse input data and, in fact, always convert to sparse csc.
 
     solver_options : dict, default=None
         Additional parameters passed to :func:`scipy.optimize.linprog` as
@@ -193,6 +193,12 @@ def fit(self, X, y, sample_weight=None):
                 f"with scipy>=1.6.0, got {sp_version}"
             )
 
+        if sparse.issparse(X) and self.solver not in ["highs", "highs-ds", "highs-ipm"]:
+            raise ValueError(
+                f"Solver {self.solver} does not support sparse X. "
+                "Use solver 'highs' for example."
+            )
+
         if self.solver_options is not None and not isinstance(
             self.solver_options, dict
         ):
@@ -214,14 +220,14 @@ def fit(self, X, y, sample_weight=None):
         # min_x c x
         #       A_eq x = b_eq
         #            0 <= x
-        # x = (s0, s, t0, t, u, v) = slack variables
+        # x = (s0, s, t0, t, u, v) = slack variables >= 0
         # intercept = s0 - t0
         # coef = s - t
-        # c = (alpha * 1_p, alpha * 1_p, quantile * 1_n, (1-quantile) * 1_n)
+        # c = (0, alpha * 1_p, 0, alpha * 1_p, quantile * 1_n, (1-quantile) * 1_n)
         # residual = y - X@coef - intercept = u - v
         # A_eq = (1_n, X, -1_n, -X, diag(1_n), -diag(1_n))
         # b_eq = y
-        # p = n_features + fit_intercept
+        # p = n_features
         # n = n_samples
         # 1_n = vector of length n with entries equal one
         # see https://stats.stackexchange.com/questions/384909/
@@ -246,14 +252,11 @@ def fit(self, X, y, sample_weight=None):
             c[0] = 0
             c[n_params] = 0
 
-        if sparse.issparse(X):
-            if self.solver not in ["highs-ds", "highs-ipm", "highs"]:
-                raise ValueError(
-                    f"Solver {self.solver} does not support sparse X. "
-                    "Use solver 'highs' for example."
-                )
-            # Note that highs methods do convert to csc.
-            # Therefore, we work with csc matrices as much as possible.
+        if self.solver in ["highs", "highs-ds", "highs-ipm"]:
+            # Note that highs methods always use a sparse CSC memory layout internally,
+            # even for optimization problems parametrized using dense numpy arrays.
+            # Therefore, we work with CSC matrices as early as possible to limit
+            # unnecessary repeated memory copies.
             eye = sparse.eye(n_indices, dtype=X.dtype, format="csc")
             if self.fit_intercept:
                 ones = sparse.csc_matrix(np.ones(shape=(n_indices, 1), dtype=X.dtype))
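A hedged usage sketch of the sparse path discussed above (assumes scipy >= 1.6 so that the "highs" solver is available; data is random and purely illustrative):

import numpy as np
from scipy import sparse
from sklearn.linear_model import QuantileRegressor

rng = np.random.RandomState(0)
X = sparse.random(100, 5, density=0.3, format="csc", random_state=rng)
y = rng.randn(100)

# Only "highs", "highs-ds" and "highs-ipm" accept sparse X; other solvers
# now raise the ValueError added earlier in fit.
reg = QuantileRegressor(quantile=0.5, alpha=0.1, solver="highs").fit(X, y)
print(reg.coef_.shape)  # (5,)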
