Commit 7134e41

rusdes, lorentzenchr, and jeremiedbb authored
API Rename OneHotEncoder option sparse to sparse_output (#24412)
Co-authored-by: Christian Lorentzen <[email protected]>
Co-authored-by: Jérémie du Boisberranger <[email protected]>
1 parent 268d6b4 commit 7134e41

File tree

10 files changed: +120, -62 lines

doc/modules/preprocessing.rst

Lines changed: 5 additions & 5 deletions
@@ -689,7 +689,7 @@ the dropped category. :meth`OneHotEncoder.inverse_transform` will map all zeros
 to the dropped category if a category is dropped and `None` if a category is
 not dropped::

-    >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse=False,
+    >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse_output=False,
     ...                                        handle_unknown='ignore').fit(X)
     >>> X_test = [['unknown', 'America', 'IE']]
     >>> X_trans = drop_enc.transform(X_test)
@@ -755,7 +755,7 @@ infrequent::

     >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 +
     ...               ['snake'] * 3], dtype=object).T
-    >>> enc = preprocessing.OneHotEncoder(min_frequency=6, sparse=False).fit(X)
+    >>> enc = preprocessing.OneHotEncoder(min_frequency=6, sparse_output=False).fit(X)
     >>> enc.infrequent_categories_
     [array(['dog', 'snake'], dtype=object)]
     >>> enc.transform(np.array([['dog'], ['cat'], ['rabbit'], ['snake']]))
@@ -768,7 +768,7 @@ By setting handle_unknown to `'infrequent_if_exist'`, unknown categories will
 be considered infrequent::

     >>> enc = preprocessing.OneHotEncoder(
-    ...     handle_unknown='infrequent_if_exist', sparse=False, min_frequency=6)
+    ...     handle_unknown='infrequent_if_exist', sparse_output=False, min_frequency=6)
     >>> enc = enc.fit(X)
     >>> enc.transform(np.array([['dragon']]))
     array([[0., 0., 1.]])
@@ -797,7 +797,7 @@ the output. This will result in all but the `'cat'` category to be considered
 infrequent, leading to two features, one for `'cat'` and one for infrequent
 categories - which are all the others::

-    >>> enc = preprocessing.OneHotEncoder(max_categories=2, sparse=False)
+    >>> enc = preprocessing.OneHotEncoder(max_categories=2, sparse_output=False)
     >>> enc = enc.fit(X)
     >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']])
     array([[0., 1.],
@@ -811,7 +811,7 @@ categories are kept. In the following example, `min_frequency=4` considers
 only `snake` to be infrequent, but `max_categories=3`, forces `dog` to also be
 infrequent::

-    >>> enc = preprocessing.OneHotEncoder(min_frequency=4, max_categories=3, sparse=False)
+    >>> enc = preprocessing.OneHotEncoder(min_frequency=4, max_categories=3, sparse_output=False)
     >>> enc = enc.fit(X)
     >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']])
     array([[0., 0., 1.],

doc/whats_new/v1.2.rst

Lines changed: 4 additions & 0 deletions
@@ -404,6 +404,10 @@ Changelog
   `n_features_in_` and `feature_names_in_` regardless of the `validate` parameter.
   :pr:`23993` by `Thomas Fan`_.

+- |API| The `sparse` parameter of :class:`preprocessing.OneHotEncoder`
+  is now deprecated and will be removed in version 1.4. Use `sparse_output` instead.
+  :pr:`24412` by :user:`Rushil Desai <rusdes>`.
+
 :mod:`sklearn.svm`
 ..................


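For context on the rename itself, here is a minimal migration sketch (not part of the diff; it assumes scikit-learn 1.2 or later and uses toy data for illustration):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["cat"], ["dog"], ["cat"]], dtype=object)

# Old spelling, deprecated in 1.2 and scheduled for removal in 1.4
# (emits a FutureWarning when fitted):
# enc = OneHotEncoder(sparse=False)

# New spelling introduced by this change:
enc = OneHotEncoder(sparse_output=False)
print(enc.fit_transform(X))  # dense ndarray of shape (3, 2)
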
sklearn/compose/tests/test_column_transformer.py

Lines changed: 6 additions & 6 deletions
@@ -497,8 +497,8 @@ def test_column_transformer_sparse_threshold():
     for thres in [0.75001, 1]:
         col_trans = ColumnTransformer(
             [
-                ("trans1", OneHotEncoder(sparse=True), [0]),
-                ("trans2", OneHotEncoder(sparse=False), [1]),
+                ("trans1", OneHotEncoder(sparse_output=True), [0]),
+                ("trans2", OneHotEncoder(sparse_output=False), [1]),
             ],
             sparse_threshold=thres,
         )
@@ -509,8 +509,8 @@ def test_column_transformer_sparse_threshold():
     for thres in [0.75, 0]:
         col_trans = ColumnTransformer(
             [
-                ("trans1", OneHotEncoder(sparse=True), [0]),
-                ("trans2", OneHotEncoder(sparse=False), [1]),
+                ("trans1", OneHotEncoder(sparse_output=True), [0]),
+                ("trans2", OneHotEncoder(sparse_output=False), [1]),
             ],
             sparse_threshold=thres,
         )
@@ -522,8 +522,8 @@ def test_column_transformer_sparse_threshold():
     for thres in [0.33, 0, 1]:
         col_trans = ColumnTransformer(
             [
-                ("trans1", OneHotEncoder(sparse=False), [0]),
-                ("trans2", OneHotEncoder(sparse=False), [1]),
+                ("trans1", OneHotEncoder(sparse_output=False), [0]),
+                ("trans2", OneHotEncoder(sparse_output=False), [1]),
             ],
             sparse_threshold=thres,
         )

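As a side note on the tests above, a minimal sketch of the behaviour they exercise (not part of the diff; the data and threshold are illustrative): ColumnTransformer stacks its outputs into a sparse matrix only when at least one transformer returns a sparse matrix and the overall density falls below `sparse_threshold`, with dense blocks counted as fully non-zero.

import numpy as np
from scipy import sparse as sp
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a", "x"], ["b", "y"], ["a", "y"]], dtype=object)

ct = ColumnTransformer(
    [
        ("trans1", OneHotEncoder(sparse_output=True), [0]),
        ("trans2", OneHotEncoder(sparse_output=False), [1]),
    ],
    # One sparse block (density 0.5) plus one dense block (counted as fully
    # non-zero) gives an overall density of 0.75, below the 0.8 threshold.
    sparse_threshold=0.8,
)
Xt = ct.fit_transform(X)
print(sp.issparse(Xt))  # True
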
sklearn/ensemble/_forest.py

Lines changed: 1 addition & 1 deletion
@@ -2757,7 +2757,7 @@ def fit_transform(self, X, y=None, sample_weight=None):
         y = rnd.uniform(size=_num_samples(X))
         super().fit(X, y, sample_weight=sample_weight)

-        self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)
+        self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output)
         output = self.one_hot_encoder_.fit_transform(self.apply(X))
         self._n_features_out = output.shape[1]
         return output

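The `_forest.py` hunk only forwards RandomTreesEmbedding's existing `sparse_output` flag to the internal encoder under its new name; the user-facing API is unchanged. A short sketch of that unchanged behaviour (illustrative data, not part of the diff):

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding

X = np.random.RandomState(0).uniform(size=(20, 2))

# `sparse_output` here is RandomTreesEmbedding's own parameter; internally it
# is now passed as OneHotEncoder(sparse_output=...) instead of OneHotEncoder(sparse=...).
emb = RandomTreesEmbedding(n_estimators=5, sparse_output=False, random_state=0)
Xt = emb.fit_transform(X)
print(Xt.shape)  # (20, total number of leaves across the 5 trees)
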
sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 1 addition & 1 deletion
@@ -964,7 +964,7 @@ def test_categorical_encoding_strategies():
     # Using OHEd data, we need less splits than with pure OEd data, but we
     # still need more splits than with the native categorical splits
     ct = make_column_transformer(
-        (OneHotEncoder(sparse=False), [1]), remainder="passthrough"
+        (OneHotEncoder(sparse_output=False), [1]), remainder="passthrough"
     )
     X_ohe = ct.fit_transform(X)
     clf_no_cat.set_params(max_depth=2)

sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py

Lines changed: 1 addition & 1 deletion
@@ -521,7 +521,7 @@ def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target):
     n_samples = 10_000
     X_binned = rng.randint(0, n_unique_categories, size=(n_samples, 1), dtype=np.uint8)

-    X_ohe = OneHotEncoder(sparse=False).fit_transform(X_binned)
+    X_ohe = OneHotEncoder(sparse_output=False).fit_transform(X_binned)
     X_ohe = np.asfortranarray(X_ohe).astype(np.uint8)

     if target == "equal":

sklearn/preprocessing/_discretization.py

Lines changed: 1 addition & 1 deletion
@@ -289,7 +289,7 @@ def fit(self, X, y=None):
         if "onehot" in self.encode:
             self._encoder = OneHotEncoder(
                 categories=[np.arange(i) for i in self.n_bins_],
-                sparse=self.encode == "onehot",
+                sparse_output=self.encode == "onehot",
                 dtype=output_dtype,
             )
             # Fit the OneHotEncoder with toy datasets

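The `_discretization.py` hunk encodes the existing KBinsDiscretizer convention: `encode="onehot"` builds the internal encoder with `sparse_output=True`, while `encode="onehot-dense"` yields a dense array. A small sketch (illustrative data, not part of the diff):

import numpy as np
from scipy import sparse as sp
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([[0.0], [1.0], [2.0], [3.0]])

kbd_sparse = KBinsDiscretizer(n_bins=2, encode="onehot").fit(X)
kbd_dense = KBinsDiscretizer(n_bins=2, encode="onehot-dense").fit(X)

print(sp.issparse(kbd_sparse.transform(X)))  # True: internal sparse_output=True
print(sp.issparse(kbd_dense.transform(X)))   # False: internal sparse_output=False
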
sklearn/preprocessing/_encoders.py

Lines changed: 32 additions & 9 deletions
@@ -14,8 +14,7 @@
 from ..utils.deprecation import deprecated
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _check_feature_names_in
-from ..utils._param_validation import Interval
-from ..utils._param_validation import StrOptions
+from ..utils._param_validation import Interval, StrOptions, Hidden
 from ..utils._mask import _get_mask

 from ..utils._encode import _encode, _check_unknown, _unique, _get_counts
@@ -209,7 +208,7 @@ class OneHotEncoder(_BaseEncoder):
     strings, denoting the values taken on by categorical (discrete) features.
     The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
     encoding scheme. This creates a binary column for each category and
-    returns a sparse matrix or dense array (depending on the ``sparse``
+    returns a sparse matrix or dense array (depending on the ``sparse_output``
     parameter)

     By default, the encoder derives the categories based on the unique values
@@ -271,6 +270,16 @@ class OneHotEncoder(_BaseEncoder):
     sparse : bool, default=True
         Will return sparse matrix if set True else will return an array.

+        .. deprecated:: 1.2
+           `sparse` is deprecated in 1.2 and will be removed in 1.4. Use
+           `sparse_output` instead.
+
+    sparse_output : bool, default=True
+        Will return sparse matrix if set True else will return an array.
+
+        .. versionadded:: 1.2
+           `sparse` was renamed to `sparse_output`
+
     dtype : number type, default=float
         Desired dtype of output.

@@ -331,7 +340,7 @@ class OneHotEncoder(_BaseEncoder):
       (if any).

     drop_idx_ : array of shape (n_features,)
-        - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
+        - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
           to be dropped for each feature.
         - ``drop_idx_[i] = None`` if no category is to be dropped from the
           feature with index ``i``, e.g. when `drop='if_binary'` and the
@@ -425,7 +434,7 @@ class OneHotEncoder(_BaseEncoder):

     >>> import numpy as np
     >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
-    >>> ohe = OneHotEncoder(max_categories=3, sparse=False).fit(X)
+    >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
     >>> ohe.infrequent_categories_
     [array(['a', 'd'], dtype=object)]
     >>> ohe.transform([["a"], ["b"]])
@@ -444,22 +453,26 @@ class OneHotEncoder(_BaseEncoder):
             Interval(Real, 0, 1, closed="neither"),
             None,
         ],
-        "sparse": ["boolean"],
+        "sparse": [Hidden(StrOptions({"deprecated"})), "boolean"],  # deprecated
+        "sparse_output": ["boolean"],
     }

     def __init__(
         self,
         *,
         categories="auto",
         drop=None,
-        sparse=True,
+        sparse="deprecated",
+        sparse_output=True,
         dtype=np.float64,
         handle_unknown="error",
         min_frequency=None,
         max_categories=None,
     ):
         self.categories = categories
+        # TODO(1.4): Remove self.sparse
         self.sparse = sparse
+        self.sparse_output = sparse_output
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.drop = drop
@@ -798,6 +811,16 @@ def fit(self, X, y=None):
             Fitted encoder.
         """
         self._validate_params()
+
+        if self.sparse != "deprecated":
+            warnings.warn(
+                "`sparse` was renamed to `sparse_output` in version 1.2 and "
+                "will be removed in 1.4. `sparse_output` is ignored unless you "
+                "leave `sparse` to its default value.",
+                FutureWarning,
+            )
+            self.sparse_output = self.sparse
+
         self._check_infrequent_enabled()

         fit_results = self._fit(
@@ -830,7 +853,7 @@ def transform(self, X):
         -------
         X_out : {ndarray, sparse matrix} of shape \
                 (n_samples, n_encoded_features)
-            Transformed input. If `sparse=True`, a sparse matrix will be
+            Transformed input. If `sparse_output=True`, a sparse matrix will be
             returned.
         """
         check_is_fitted(self)
@@ -879,7 +902,7 @@ def transform(self, X):
             shape=(n_samples, feature_indices[-1]),
             dtype=self.dtype,
         )
-        if not self.sparse:
+        if not self.sparse_output:
             return out.toarray()
         else:
             return out

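The user-facing effect of the compatibility shim added to `fit` above can be sketched as follows (only meaningful on scikit-learn 1.2 or 1.3, where the legacy keyword still exists; toy data for illustration):

import warnings
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"], ["b"]], dtype=object)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    enc = OneHotEncoder(sparse=False)  # legacy keyword
    Xt = enc.fit_transform(X)          # fit copies `sparse` into `sparse_output`

print(type(Xt))  # <class 'numpy.ndarray'>: the legacy value is still honored
print(any(w.category is FutureWarning for w in caught))  # True
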
sklearn/preprocessing/tests/test_discretization.py

Lines changed: 4 additions & 2 deletions
@@ -139,15 +139,17 @@ def test_encode_options():
     assert not sp.issparse(Xt_2)
     assert_array_equal(
         OneHotEncoder(
-            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse=False
+            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=False
         ).fit_transform(Xt_1),
         Xt_2,
     )
     est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode="onehot").fit(X)
     Xt_3 = est.transform(X)
     assert sp.issparse(Xt_3)
     assert_array_equal(
-        OneHotEncoder(categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse=True)
+        OneHotEncoder(
+            categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=True
+        )
         .fit_transform(Xt_1)
         .toarray(),
         Xt_3.toarray(),
