
Use array_api in Gaussian Mixture Models #99


Status: Open. Wants to merge 15 commits into base: main.
5 changes: 4 additions & 1 deletion doc/whats_new/v1.1.rst
@@ -148,7 +148,7 @@ Changelog
:user:`Sebastian Pujalte <pujaltes>`.

- |Enhancement| :func:`datasets.make_blobs` no longer copies data during the generation
process, therefore uses less memory.
process, therefore uses less memory.
:pr:`22412` by :user:`Zhehao Liu <MaxwellLZH>`.

- |Enhancement| :func:`datasets.load_diabetes` now accepts the parameter
@@ -491,6 +491,9 @@ Changelog
:mod:`sklearn.mixture`
......................

- |Enhancement| Added Array API support for :class:`mixture.GaussianMixture`
for `init_params="random"` and `covariance_type="full"`. :pr:`xxxxx` by `Thomas Fan`_.

- |Fix| Fixed a bug so that `precisions_cholesky_` is correctly initialized in
:class:`mixture.GaussianMixture` when `precisions_init` is provided, by taking
its square root.
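
A rough end-to-end usage sketch for the entry above (not part of the diff). It assumes NumPy >= 1.22 so the experimental `numpy.array_api` namespace is importable, and that input validation accepts such arrays as exercised by this PR:

import numpy as np
import numpy.array_api as xp  # assumption: experimental Array API reference namespace

from sklearn import config_context
from sklearn.mixture import GaussianMixture

X_np = np.random.RandomState(0).standard_normal(size=(200, 3))
X_xp = xp.asarray(X_np)

with config_context(array_api_dispatch=True):
    # Only init_params="random" with covariance_type="full" is covered by this PR.
    gmm = GaussianMixture(
        n_components=2, init_params="random", covariance_type="full", random_state=0
    )
    labels = gmm.fit_predict(X_xp)
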
27 changes: 25 additions & 2 deletions sklearn/_config.py
@@ -9,6 +9,7 @@
"working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)),
"print_changed_only": True,
"display": "text",
"array_api_dispatch": False,
}
_threadlocal = threading.local()

@@ -40,7 +41,11 @@ def get_config():


def set_config(
assume_finite=None, working_memory=None, print_changed_only=None, display=None
assume_finite=None,
working_memory=None,
print_changed_only=None,
display=None,
array_api_dispatch=None,
):
"""Set global scikit-learn configuration

@@ -80,6 +85,11 @@

.. versionadded:: 0.23

array_api_dispatch : bool, default=None
Configure scikit-learn to use the Array API standard when inputs support it. Global default: False

.. versionadded:: 1.2

See Also
--------
config_context : Context manager for global scikit-learn configuration.
@@ -95,11 +105,18 @@
local_config["print_changed_only"] = print_changed_only
if display is not None:
local_config["display"] = display
if array_api_dispatch is not None:
local_config["array_api_dispatch"] = array_api_dispatch


@contextmanager
def config_context(
*, assume_finite=None, working_memory=None, print_changed_only=None, display=None
*,
assume_finite=None,
working_memory=None,
print_changed_only=None,
display=None,
array_api_dispatch=None,
):
"""Context manager for global scikit-learn configuration.

@@ -138,6 +155,11 @@ def config_context(

.. versionadded:: 0.23

array_api_dispatch : bool, default=None
Configure scikit-learn to use the Array API standard when inputs support it. Global default: False

.. versionadded:: 1.2

Yields
------
None.
@@ -171,6 +193,7 @@ def config_context(
working_memory=working_memory,
print_changed_only=print_changed_only,
display=display,
array_api_dispatch=array_api_dispatch,
)

try:
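
For reference, a small sketch (not part of the diff) of how the new flag behaves with the code above: it lives in the per-thread configuration dict, `set_config` changes it globally, and `config_context` restores the previous value on exit:

import sklearn

print(sklearn.get_config()["array_api_dispatch"])  # False, the new global default

sklearn.set_config(array_api_dispatch=True)
print(sklearn.get_config()["array_api_dispatch"])  # True

with sklearn.config_context(array_api_dispatch=False):
    print(sklearn.get_config()["array_api_dispatch"])  # False inside the context
print(sklearn.get_config()["array_api_dispatch"])  # True again after the context exits

sklearn.set_config(array_api_dispatch=False)  # back to the default
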
23 changes: 14 additions & 9 deletions sklearn/mixture/_base.py
@@ -9,13 +9,13 @@
from time import time

import numpy as np
from scipy.special import logsumexp

from .. import cluster
from ..base import BaseEstimator
from ..base import DensityMixin
from ..exceptions import ConvergenceWarning
from ..utils import check_random_state
from ..utils._array_api import get_namespace, logsumexp
from ..utils.validation import check_is_fitted


@@ -148,8 +148,10 @@ def _initialize_parameters(self, X, random_state):
)
resp[np.arange(n_samples), label] = 1
elif self.init_params == "random":
xp, _ = get_namespace(X)
resp = random_state.uniform(size=(n_samples, self.n_components))
resp /= resp.sum(axis=1)[:, np.newaxis]
resp = xp.asarray(resp)
resp /= xp.reshape(xp.sum(resp, axis=1), (-1, 1))
else:
raise ValueError(
"Unimplemented initialization method '%s'" % self.init_params
@@ -225,7 +227,8 @@ def fit_predict(self, X, y=None):
labels : array, shape (n_samples,)
Component labels.
"""
X = self._validate_data(X, dtype=[np.float64, np.float32], ensure_min_samples=2)
xp, _ = get_namespace(X)
X = self._validate_data(X, dtype=[xp.float64, xp.float32], ensure_min_samples=2)
if X.shape[0] < self.n_components:
raise ValueError(
"Expected n_samples >= n_components "
@@ -238,7 +241,7 @@ def fit_predict(self, X, y=None):
do_init = not (self.warm_start and hasattr(self, "converged_"))
n_init = self.n_init if do_init else 1

max_lower_bound = -np.inf
max_lower_bound = -xp.inf
self.converged_ = False

random_state = check_random_state(self.random_state)
@@ -250,7 +253,7 @@ def fit_predict(self, X, y=None):
if do_init:
self._initialize_parameters(X, random_state)

lower_bound = -np.inf if do_init else self.lower_bound_
lower_bound = -xp.inf if do_init else self.lower_bound_

for n_iter in range(1, self.max_iter + 1):
prev_lower_bound = lower_bound
@@ -268,7 +271,7 @@ def fit_predict(self, X, y=None):

self._print_verbose_msg_init_end(lower_bound)

if lower_bound > max_lower_bound or max_lower_bound == -np.inf:
if lower_bound > max_lower_bound or max_lower_bound == -xp.inf:
max_lower_bound = lower_bound
best_params = self._get_parameters()
best_n_iter = n_iter
@@ -291,7 +294,7 @@ def fit_predict(self, X, y=None):
# for any value of max_iter and tol (and any random_state).
_, log_resp = self._e_step(X)

return log_resp.argmax(axis=1)
return xp.argmax(log_resp, axis=1)

def _e_step(self, X):
"""E step.
@@ -309,8 +312,9 @@ def _e_step(self, X):
Logarithm of the posterior probabilities (or responsibilities) of
the point of each sample in X.
"""
xp, _ = get_namespace(X)
log_prob_norm, log_resp = self._estimate_log_prob_resp(X)
return np.mean(log_prob_norm), log_resp
return xp.mean(log_prob_norm), log_resp

@abstractmethod
def _m_step(self, X, log_resp):
@@ -529,11 +533,12 @@ def _estimate_log_prob_resp(self, X):
log_responsibilities : array, shape (n_samples, n_components)
logarithm of the responsibilities
"""
xp, _ = get_namespace(X)
weighted_log_prob = self._estimate_weighted_log_prob(X)
log_prob_norm = logsumexp(weighted_log_prob, axis=1)
with np.errstate(under="ignore"):
# ignore underflow
log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]
log_resp = weighted_log_prob - xp.reshape(log_prob_norm, (-1, 1))
return log_prob_norm, log_resp

def _print_verbose_msg_init_beg(self, n_init):
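
For context, `_estimate_log_prob_resp` normalizes the weighted log probabilities per sample; the `logsumexp` imported from `utils._array_api` is assumed to behave like `scipy.special.logsumexp`. A small NumPy/SciPy illustration of the same computation (not part of the diff):

import numpy as np
from scipy.special import logsumexp

# toy weighted log probabilities, shape (n_samples, n_components)
weighted_log_prob = np.log(np.array([[0.2, 0.6], [0.5, 0.5]]))
log_prob_norm = logsumexp(weighted_log_prob, axis=1)             # log sum_k exp(...)
log_resp = weighted_log_prob - np.reshape(log_prob_norm, (-1, 1))

# exponentiated rows sum to 1: these are the posterior responsibilities
assert np.allclose(np.exp(log_resp).sum(axis=1), 1.0)
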
89 changes: 56 additions & 33 deletions sklearn/mixture/_gaussian_mixture.py
@@ -5,12 +5,16 @@
# License: BSD 3 clause

import numpy as np
from math import log
from functools import partial

from scipy import linalg
import scipy

from ._base import BaseMixture, _check_shape
from ..utils import check_array
from ..utils.extmath import row_norms
from ..utils._array_api import get_namespace


###############################################################################
@@ -171,12 +175,17 @@ def _estimate_gaussian_covariances_full(resp, X, nk, means, reg_covar):
covariances : array, shape (n_components, n_features, n_features)
The covariance matrix of the current components.
"""
xp, is_array_api = get_namespace(resp, X, nk)
n_components, n_features = means.shape
covariances = np.empty((n_components, n_features, n_features))
covariances = xp.empty((n_components, n_features, n_features))
for k in range(n_components):
diff = X - means[k]
covariances[k] = np.dot(resp[:, k] * diff.T, diff) / nk[k]
covariances[k].flat[:: n_features + 1] += reg_covar
diff = X - means[k, :]
covariances[k, :, :] = ((resp[:, k] * diff.T) @ diff) / nk[k]
if is_array_api:
for i in range(n_features):
covariances[k, i, i] += reg_covar
else:
covariances[k].flat[:: n_features + 1] += reg_covar
return covariances
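
The explicit `covariances[k, i, i] += reg_covar` loop on the Array API branch is meant to match the NumPy `.flat[:: n_features + 1]` diagonal trick it replaces; a short equivalence check (illustration, not part of the diff):

import numpy as np

n_features, reg_covar = 3, 1e-6
cov = np.array([[2.0, 0.3, 0.1], [0.3, 1.5, 0.2], [0.1, 0.2, 1.0]])

a = cov.copy()
a.flat[:: n_features + 1] += reg_covar   # strided view over the diagonal

b = cov.copy()
for i in range(n_features):              # element-wise updates, valid for Array API arrays
    b[i, i] += reg_covar

assert np.allclose(a, b)
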


@@ -286,8 +295,9 @@ def _estimate_gaussian_parameters(X, resp, reg_covar, covariance_type):
The covariance matrix of the current components.
The shape depends of the covariance_type.
"""
nk = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps
means = np.dot(resp.T, X) / nk[:, np.newaxis]
xp, _ = get_namespace(X, resp)
nk = xp.sum(resp, axis=0) + 10 * xp.finfo(resp.dtype).eps
means = resp.T @ X / xp.reshape(nk, (-1, 1))
covariances = {
"full": _estimate_gaussian_covariances_full,
"tied": _estimate_gaussian_covariances_tied,
@@ -321,27 +331,34 @@ def _compute_precision_cholesky(covariances, covariance_type):
"or collapsed samples). Try to decrease the number of components, "
"or increase reg_covar."
)
xp, is_array_api = get_namespace(covariances)
if is_array_api:
cholesky = xp.linalg.cholesky
solve = xp.linalg.solve
else:
cholesky = partial(scipy.linalg.cholesky, lower=True)
solve = partial(scipy.linalg.solve_triangular, lower=True)

if covariance_type == "full":
n_components, n_features, _ = covariances.shape
precisions_chol = np.empty((n_components, n_features, n_features))
for k, covariance in enumerate(covariances):
precisions_chol = xp.empty((n_components, n_features, n_features))
for k in range(n_components):
try:
cov_chol = linalg.cholesky(covariance, lower=True)
cov_chol = cholesky(covariances[k, :, :])
except linalg.LinAlgError:
raise ValueError(estimate_precision_error_message)
precisions_chol[k] = linalg.solve_triangular(
cov_chol, np.eye(n_features), lower=True
).T
precisions_chol[k, :, :] = solve(cov_chol, xp.eye(n_features)).T

if is_array_api:
precisions_chol[k, :, :] = xp.triu(precisions_chol[k, :, :])

elif covariance_type == "tied":
_, n_features = covariances.shape
try:
cov_chol = linalg.cholesky(covariances, lower=True)
cov_chol = cholesky(covariances)
except linalg.LinAlgError:
raise ValueError(estimate_precision_error_message)
precisions_chol = linalg.solve_triangular(
cov_chol, np.eye(n_features), lower=True
).T
precisions_chol = linalg.solve(cov_chol, np.eye(n_features)).T
else:
if np.any(np.less_equal(covariances, 0.0)):
raise ValueError(estimate_precision_error_message)
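
As background for this function: each `precisions_chol[k]` is the transposed inverse of the lower Cholesky factor of the component covariance, so `precisions_chol[k] @ precisions_chol[k].T` recovers the precision matrix. A NumPy sanity check (illustration, not part of the diff):

import numpy as np

rng = np.random.RandomState(0)
A = rng.standard_normal(size=(4, 4))
cov = A @ A.T + 4 * np.eye(4)                      # a well-conditioned covariance

cov_chol = np.linalg.cholesky(cov)                 # lower triangular factor
prec_chol = np.linalg.solve(cov_chol, np.eye(4)).T

assert np.allclose(prec_chol @ prec_chol.T, np.linalg.inv(cov))
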
@@ -373,20 +390,20 @@ def _compute_log_det_cholesky(matrix_chol, covariance_type, n_features):
log_det_precision_chol : array-like of shape (n_components,)
The determinant of the precision matrix for each component.
"""
xp, _ = get_namespace(matrix_chol)
if covariance_type == "full":
n_components, _, _ = matrix_chol.shape
log_det_chol = np.sum(
np.log(matrix_chol.reshape(n_components, -1)[:, :: n_features + 1]), 1
)
matrix_chol_reshape = xp.reshape(matrix_chol, (n_components, -1))
log_det_chol = xp.sum(xp.log(matrix_chol_reshape[:, :: n_features + 1]), axis=1)

elif covariance_type == "tied":
log_det_chol = np.sum(np.log(np.diag(matrix_chol)))
log_det_chol = xp.sum(xp.log(xp.diag(matrix_chol)))

elif covariance_type == "diag":
log_det_chol = np.sum(np.log(matrix_chol), axis=1)
log_det_chol = xp.sum(xp.log(matrix_chol), axis=1)

else:
log_det_chol = n_features * (np.log(matrix_chol))
log_det_chol = n_features * (xp.log(matrix_chol))

return log_det_chol
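
In the "full" branch above, reshaping to `(n_components, -1)` and slicing with `[:, :: n_features + 1]` picks out each component's diagonal, so the summed logs equal the log-determinant of each triangular Cholesky factor. A NumPy illustration (not part of the diff):

import numpy as np

n_components, n_features = 2, 3
rng = np.random.RandomState(0)
matrix_chol = np.tril(rng.uniform(0.5, 1.5, size=(n_components, n_features, n_features)))

reshaped = np.reshape(matrix_chol, (n_components, -1))
log_det = np.sum(np.log(reshaped[:, :: n_features + 1]), axis=1)

# determinant of a triangular matrix is the product of its diagonal entries
assert np.allclose(log_det, np.log(np.linalg.det(matrix_chol)))
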

@@ -413,6 +430,7 @@ def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type):
-------
log_prob : array, shape (n_samples, n_components)
"""
xp, _ = get_namespace(X, means, precisions_chol)
n_samples, n_features = X.shape
n_components, _ = means.shape
# The determinant of the precision matrix from the Cholesky decomposition
@@ -422,10 +440,12 @@ def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type):
log_det = _compute_log_det_cholesky(precisions_chol, covariance_type, n_features)

if covariance_type == "full":
log_prob = np.empty((n_samples, n_components))
for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
log_prob[:, k] = np.sum(np.square(y), axis=1)
log_prob = xp.empty((n_samples, n_components))
for k in range(n_components):
mu = means[k, :]
prec_chol = precisions_chol[k, :, :]
y = X @ prec_chol - mu @ prec_chol
log_prob[:, k] = xp.sum(xp.square(y), axis=1)

elif covariance_type == "tied":
log_prob = np.empty((n_samples, n_components))
@@ -450,7 +470,7 @@ def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type):
)
# Since we are using the precision of the Cholesky decomposition,
# `- 0.5 * log_det_precision` becomes `+ log_det_precision_chol`
return -0.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det
return -0.5 * (n_features * log(2 * xp.pi) + log_prob) + log_det
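
The returned expression is the usual multivariate normal log-density written with the precision Cholesky factor (the comment above explains the sign of the determinant term). A single-component consistency check against SciPy (illustration, not part of the diff):

import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.RandomState(0)
X = rng.standard_normal(size=(5, 3))
mu = rng.standard_normal(size=3)
A = rng.standard_normal(size=(3, 3))
cov = A @ A.T + 3 * np.eye(3)

cov_chol = np.linalg.cholesky(cov)
prec_chol = np.linalg.solve(cov_chol, np.eye(3)).T

y = X @ prec_chol - mu @ prec_chol
log_prob = np.sum(np.square(y), axis=1)          # squared Mahalanobis distances
log_det = np.sum(np.log(np.diag(prec_chol)))     # log|precision Cholesky factor|
log_dens = -0.5 * (3 * np.log(2 * np.pi) + log_prob) + log_det

assert np.allclose(log_dens, multivariate_normal(mean=mu, cov=cov).logpdf(X))
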


class GaussianMixture(BaseMixture):
@@ -742,8 +762,9 @@ def _m_step(self, X, log_resp):
the point of each sample in X.
"""
n_samples, _ = X.shape
xp, _ = get_namespace(X, log_resp)
self.weights_, self.means_, self.covariances_ = _estimate_gaussian_parameters(
X, np.exp(log_resp), self.reg_covar, self.covariance_type
X, xp.exp(log_resp), self.reg_covar, self.covariance_type
)
self.weights_ /= n_samples
self.precisions_cholesky_ = _compute_precision_cholesky(
@@ -756,7 +777,8 @@ def _estimate_log_prob(self, X):
)

def _estimate_log_weights(self):
return np.log(self.weights_)
xp, _ = get_namespace(self.weights_)
return xp.log(self.weights_)

def _compute_lower_bound(self, _, log_prob_norm):
return log_prob_norm
@@ -779,11 +801,12 @@ def _set_parameters(self, params):

# Attributes computation
_, n_features = self.means_.shape

if self.covariance_type == "full":
self.precisions_ = np.empty(self.precisions_cholesky_.shape)
for k, prec_chol in enumerate(self.precisions_cholesky_):
self.precisions_[k] = np.dot(prec_chol, prec_chol.T)
prec_chol = self.precisions_cholesky_
xp, _ = get_namespace(prec_chol)
self.precisions_ = xp.empty(prec_chol.shape)
for k in range(prec_chol.shape[0]):
self.precisions_[k, :, :] = prec_chol[k, :, :] @ prec_chol[k, :, :].T

elif self.covariance_type == "tied":
self.precisions_ = np.dot(