Commit 7134e41

rusdes, lorentzenchr, and jeremiedbb authored
API Rename OneHotEncoder option sparse to sparse_output (#24412)
Co-authored-by: Christian Lorentzen <[email protected]>
Co-authored-by: Jérémie du Boisberranger <[email protected]>
1 parent 268d6b4 commit 7134e41

File tree

10 files changed: +120, -62 lines

doc/modules/preprocessing.rst

Lines changed: 5 additions & 5 deletions
@@ -689,7 +689,7 @@ the dropped category. :meth`OneHotEncoder.inverse_transform` will map all zeros
 to the dropped category if a category is dropped and `None` if a category is
 not dropped::

-    >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse=False,
+    >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse_output=False,
     ...                                        handle_unknown='ignore').fit(X)
     >>> X_test = [['unknown', 'America', 'IE']]
     >>> X_trans = drop_enc.transform(X_test)
@@ -755,7 +755,7 @@ infrequent::

     >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 +
     ...               ['snake'] * 3], dtype=object).T
-    >>> enc = preprocessing.OneHotEncoder(min_frequency=6, sparse=False).fit(X)
+    >>> enc = preprocessing.OneHotEncoder(min_frequency=6, sparse_output=False).fit(X)
     >>> enc.infrequent_categories_
     [array(['dog', 'snake'], dtype=object)]
     >>> enc.transform(np.array([['dog'], ['cat'], ['rabbit'], ['snake']]))
@@ -768,7 +768,7 @@ By setting handle_unknown to `'infrequent_if_exist'`, unknown categories will
 be considered infrequent::

     >>> enc = preprocessing.OneHotEncoder(
-    ...     handle_unknown='infrequent_if_exist', sparse=False, min_frequency=6)
+    ...     handle_unknown='infrequent_if_exist', sparse_output=False, min_frequency=6)
     >>> enc = enc.fit(X)
     >>> enc.transform(np.array([['dragon']]))
     array([[0., 0., 1.]])
@@ -797,7 +797,7 @@ the output. This will result in all but the `'cat'` category to be considered
 infrequent, leading to two features, one for `'cat'` and one for infrequent
 categories - which are all the others::

-    >>> enc = preprocessing.OneHotEncoder(max_categories=2, sparse=False)
+    >>> enc = preprocessing.OneHotEncoder(max_categories=2, sparse_output=False)
     >>> enc = enc.fit(X)
     >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']])
     array([[0., 1.],
@@ -811,7 +811,7 @@ categories are kept. In the following example, `min_frequency=4` considers
 only `snake` to be infrequent, but `max_categories=3`, forces `dog` to also be
 infrequent::

-    >>> enc = preprocessing.OneHotEncoder(min_frequency=4, max_categories=3, sparse=False)
+    >>> enc = preprocessing.OneHotEncoder(min_frequency=4, max_categories=3, sparse_output=False)
     >>> enc = enc.fit(X)
     >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']])
     array([[0., 0., 1.],

doc/whats_new/v1.2.rst

Lines changed: 4 additions & 0 deletions
@@ -404,6 +404,10 @@ Changelog
   `n_features_in_` and `feature_names_in_` regardless of the `validate` parameter.
   :pr:`23993` by `Thomas Fan`_.

+- |API| The `sparse` parameter of :class:`preprocessing.OneHotEncoder`
+  is now deprecated and will be removed in version 1.4. Use `sparse_output` instead.
+  :pr:`24412` by :user:`Rushil Desai <rusdes>`.
+
 :mod:`sklearn.svm`
 ..................


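For context on the rename itself, here is a minimal migration sketch (not part of the diff; it assumes scikit-learn 1.2 or later and uses toy data for illustration):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["cat"], ["dog"], ["cat"]], dtype=object)

# Old spelling, deprecated in 1.2 and scheduled for removal in 1.4
# (emits a FutureWarning when fitted):
# enc = OneHotEncoder(sparse=False)

# New spelling introduced by this change:
enc = OneHotEncoder(sparse_output=False)
print(enc.fit_transform(X))  # dense ndarray of shape (3, 2)
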
sklearn/compose/tests/test_column_transformer.py

Lines changed: 6 additions & 6 deletions
@@ -497,8 +497,8 @@ def test_column_transformer_sparse_threshold():
     for thres in [0.75001, 1]:
         col_trans = ColumnTransformer(
             [
-                ("trans1", OneHotEncoder(sparse=True), [0]),
-                ("trans2", OneHotEncoder(sparse=False), [1]),
+                ("trans1", OneHotEncoder(sparse_output=True), [0]),
+                ("trans2", OneHotEncoder(sparse_output=False), [1]),
             ],
             sparse_threshold=thres,
         )
@@ -509,8 +509,8 @@ def test_column_transformer_sparse_threshold():
     for thres in [0.75, 0]:
         col_trans = ColumnTransformer(
             [
-                ("trans1", OneHotEncoder(sparse=True), [0]),
-                ("trans2", OneHotEncoder(sparse=False), [1]),
+                ("trans1", OneHotEncoder(sparse_output=True), [0]),
+                ("trans2", OneHotEncoder(sparse_output=False), [1]),
             ],
             sparse_threshold=thres,
         )
@@ -522,8 +522,8 @@ def test_column_transformer_sparse_threshold():
     for thres in [0.33, 0, 1]:
         col_trans = ColumnTransformer(
             [
-                ("trans1", OneHotEncoder(sparse=False), [0]),
-                ("trans2", OneHotEncoder(sparse=False), [1]),
+                ("trans1", OneHotEncoder(sparse_output=False), [0]),
+                ("trans2", OneHotEncoder(sparse_output=False), [1]),
             ],
             sparse_threshold=thres,
         )

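As a side note on the tests above, a minimal sketch of the behaviour they exercise (not part of the diff; the data and threshold are illustrative): ColumnTransformer stacks its outputs into a sparse matrix only when at least one transformer returns a sparse matrix and the overall density falls below `sparse_threshold`, with dense blocks counted as fully non-zero.

import numpy as np
from scipy import sparse as sp
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a", "x"], ["b", "y"], ["a", "y"]], dtype=object)

ct = ColumnTransformer(
    [
        ("trans1", OneHotEncoder(sparse_output=True), [0]),
        ("trans2", OneHotEncoder(sparse_output=False), [1]),
    ],
    # One sparse block (density 0.5) plus one dense block (counted as fully
    # non-zero) gives an overall density of 0.75, below the 0.8 threshold.
    sparse_threshold=0.8,
)
Xt = ct.fit_transform(X)
print(sp.issparse(Xt))  # True
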
sklearn/ensemble/_forest.py

Lines changed: 1 addition & 1 deletion
@@ -2757,7 +2757,7 @@ def fit_transform(self, X, y=None, sample_weight=None):
         y = rnd.uniform(size=_num_samples(X))
         super().fit(X, y, sample_weight=sample_weight)

-        self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)
+        self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output)
         output = self.one_hot_encoder_.fit_transform(self.apply(X))
         self._n_features_out = output.shape[1]
         return output

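The `_forest.py` hunk only forwards RandomTreesEmbedding's existing `sparse_output` flag to the internal encoder under its new name; the user-facing API is unchanged. A short sketch of that unchanged behaviour (illustrative data, not part of the diff):

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding

X = np.random.RandomState(0).uniform(size=(20, 2))

# `sparse_output` here is RandomTreesEmbedding's own parameter; internally it
# is now passed as OneHotEncoder(sparse_output=...) instead of OneHotEncoder(sparse=...).
emb = RandomTreesEmbedding(n_estimators=5, sparse_output=False, random_state=0)
Xt = emb.fit_transform(X)
print(Xt.shape)  # (20, total number of leaves across the 5 trees)
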
sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 1 addition & 1 deletion
@@ -964,7 +964,7 @@ def test_categorical_encoding_strategies():
     # Using OHEd data, we need less splits than with pure OEd data, but we
     # still need more splits than with the native categorical splits
     ct = make_column_transformer(
-        (OneHotEncoder(sparse=False), [1]), remainder="passthrough"
+        (OneHotEncoder(sparse_output=False), [1]), remainder="passthrough"
     )
     X_ohe = ct.fit_transform(X)
     clf_no_cat.set_params(max_depth=2)

sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py

Lines changed: 1 addition & 1 deletion
@@ -521,7 +521,7 @@ def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target):
     n_samples = 10_000
     X_binned = rng.randint(0, n_unique_categories, size=(n_samples, 1), dtype=np.uint8)

-    X_ohe = OneHotEncoder(sparse=False).fit_transform(X_binned)
+    X_ohe = OneHotEncoder(sparse_output=False).fit_transform(X_binned)
     X_ohe = np.asfortranarray(X_ohe).astype(np.uint8)

     if target == "equal":

sklearn/preprocessing/_discretization.py

Lines changed: 1 addition & 1 deletion
@@ -289,7 +289,7 @@ def fit(self, X, y=None):
         if "onehot" in self.encode:
             self._encoder = OneHotEncoder(
                 categories=[np.arange(i) for i in self.n_bins_],
-                sparse=self.encode == "onehot",
+                sparse_output=self.encode == "onehot",
                 dtype=output_dtype,
             )
             # Fit the OneHotEncoder with toy datasets

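The `_discretization.py` hunk encodes the existing KBinsDiscretizer convention: `encode="onehot"` builds the internal encoder with `sparse_output=True`, while `encode="onehot-dense"` yields a dense array. A small sketch (illustrative data, not part of the diff):

import numpy as np
from scipy import sparse as sp
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([[0.0], [1.0], [2.0], [3.0]])

kbd_sparse = KBinsDiscretizer(n_bins=2, encode="onehot").fit(X)
kbd_dense = KBinsDiscretizer(n_bins=2, encode="onehot-dense").fit(X)

print(sp.issparse(kbd_sparse.transform(X)))  # True: internal sparse_output=True
print(sp.issparse(kbd_dense.transform(X)))   # False: internal sparse_output=False
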
sklearn/preprocessing/_encoders.py

Lines changed: 32 additions & 9 deletions
@@ -14,8 +14,7 @@
 from ..utils.deprecation import deprecated
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _check_feature_names_in
-from ..utils._param_validation import Interval
-from ..utils._param_validation import StrOptions
+from ..utils._param_validation import Interval, StrOptions, Hidden
 from ..utils._mask import _get_mask

 from ..utils._encode import _encode, _check_unknown, _unique, _get_counts
@@ -209,7 +208,7 @@ class OneHotEncoder(_BaseEncoder):
     strings, denoting the values taken on by categorical (discrete) features.
     The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
     encoding scheme. This creates a binary column for each category and
-    returns a sparse matrix or dense array (depending on the ``sparse``
+    returns a sparse matrix or dense array (depending on the ``sparse_output``
     parameter)

     By default, the encoder derives the categories based on the unique values
@@ -271,6 +270,16 @@ class OneHotEncoder(_BaseEncoder):
     sparse : bool, default=True
         Will return sparse matrix if set True else will return an array.

+        .. deprecated:: 1.2
+           `sparse` is deprecated in 1.2 and will be removed in 1.4. Use
+           `sparse_output` instead.
+
+    sparse_output : bool, default=True
+        Will return sparse matrix if set True else will return an array.
+
+        .. versionadded:: 1.2
+           `sparse` was renamed to `sparse_output`
+
     dtype : number type, default=float
         Desired dtype of output.

@@ -331,7 +340,7 @@ class OneHotEncoder(_BaseEncoder):
       (if any).

     drop_idx_ : array of shape (n_features,)
-        - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
+        - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
           to be dropped for each feature.
         - ``drop_idx_[i] = None`` if no category is to be dropped from the
           feature with index ``i``, e.g. when `drop='if_binary'` and the
@@ -425,7 +434,7 @@ class OneHotEncoder(_BaseEncoder):

     >>> import numpy as np
     >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
-    >>> ohe = OneHotEncoder(max_categories=3, sparse=False).fit(X)
+    >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
     >>> ohe.infrequent_categories_
     [array(['a', 'd'], dtype=object)]
     >>> ohe.transform([["a"], ["b"]])
@@ -444,22 +453,26 @@ class OneHotEncoder(_BaseEncoder):
             Interval(Real, 0, 1, closed="neither"),
             None,
         ],
-        "sparse": ["boolean"],
+        "sparse": [Hidden(StrOptions({"deprecated"})), "boolean"],  # deprecated
+        "sparse_output": ["boolean"],
     }

     def __init__(
         self,
         *,
         categories="auto",
         drop=None,
-        sparse=True,
+        sparse="deprecated",
+        sparse_output=True,
         dtype=np.float64,
         handle_unknown="error",
         min_frequency=None,
         max_categories=None,
     ):
         self.categories = categories
+        # TODO(1.4): Remove self.sparse
         self.sparse = sparse
+        self.sparse_output = sparse_output
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.drop = drop
@@ -798,6 +811,16 @@ def fit(self, X, y=None):
             Fitted encoder.
         """
         self._validate_params()
+
+        if self.sparse != "deprecated":
+            warnings.warn(
+                "`sparse` was renamed to `sparse_output` in version 1.2 and "
+                "will be removed in 1.4. `sparse_output` is ignored unless you "
+                "leave `sparse` to its default value.",
+                FutureWarning,
+            )
+            self.sparse_output = self.sparse
+
         self._check_infrequent_enabled()

         fit_results = self._fit(
@@ -830,7 +853,7 @@ def transform(self, X):
         -------
         X_out : {ndarray, sparse matrix} of shape \
                 (n_samples, n_encoded_features)
-            Transformed input. If `sparse=True`, a sparse matrix will be
+            Transformed input. If `sparse_output=True`, a sparse matrix will be
             returned.
         """
         check_is_fitted(self)
@@ -879,7 +902,7 @@ def transform(self, X):
             shape=(n_samples, feature_indices[-1]),
             dtype=self.dtype,
         )
-        if not self.sparse:
+        if not self.sparse_output:
             return out.toarray()
         else:
             return out

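The user-facing effect of the compatibility shim added to `fit` above can be sketched as follows (only meaningful on scikit-learn 1.2 or 1.3, where the legacy keyword still exists; toy data for illustration):

import warnings
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"], ["b"]], dtype=object)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    enc = OneHotEncoder(sparse=False)  # legacy keyword
    Xt = enc.fit_transform(X)          # fit copies `sparse` into `sparse_output`

print(type(Xt))  # <class 'numpy.ndarray'>: the legacy value is still honored
print(any(w.category is FutureWarning for w in caught))  # True
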
sklearn/preprocessing/tests/test_discretization.py

Lines changed: 4 additions & 2 deletions
@@ -139,15 +139,17 @@ def test_encode_options():
     assert not sp.issparse(Xt_2)
     assert_array_equal(
         OneHotEncoder(
-            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse=False
+            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=False
         ).fit_transform(Xt_1),
         Xt_2,
     )
     est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="onehot").fit(X)
     Xt_3 = est.transform(X)
     assert sp.issparse(Xt_3)
     assert_array_equal(
-        OneHotEncoder(categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse=True)
+        OneHotEncoder(
+            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=True
+        )
         .fit_transform(Xt_1)
         .toarray(),
         Xt_3.toarray(),
