-
-
Notifications
You must be signed in to change notification settings - Fork 26.5k
Closed
Labels
Description
Describe the bug
I would like to use pandas transform output and use a custom transformer in a feature union which aggregates data. When I'm using this combination I got an error. When I use default numpy output it works fine.
Steps/Code to Reproduce
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config
from sklearn.pipeline import make_union
index = pd.date_range(start="2020-01-01", end="2020-01-05", inclusive="left", freq="H")
data = pd.DataFrame(index=index, data=[10] * len(index), columns=["value"])
data["date"] = index.date
class MyTransformer(BaseEstimator, TransformerMixin):
def fit(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs):
return self
def transform(self, X: pd.DataFrame, y: pd.Series | None = None) -> pd.DataFrame:
return X["value"].groupby(X["date"]).sum()
# This works.
set_config(transform_output="default")
print(make_union(MyTransformer()).fit_transform(data))
# This does not work.
set_config(transform_output="pandas")
print(make_union(MyTransformer()).fit_transform(data))Expected Results
No error is thrown when using pandas transform output.
Actual Results
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[5], line 25
23 # This does not work.
24 set_config(transform_output="pandas")
---> 25 print(make_union(MyTransformer()).fit_transform(data))
File ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/sklearn/utils/_set_output.py:150, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
143 if isinstance(data_to_wrap, tuple):
144 # only wrap the first output for cross decomposition
145 return (
146 _wrap_data_with_container(method, data_to_wrap[0], X, self),
147 *data_to_wrap[1:],
148 )
--> 150 return _wrap_data_with_container(method, data_to_wrap, X, self)
File ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/sklearn/utils/_set_output.py:130, in _wrap_data_with_container(method, data_to_wrap, original_input, estimator)
127 return data_to_wrap
129 # dense_config == "pandas"
--> 130 return _wrap_in_pandas_container(
131 data_to_wrap=data_to_wrap,
132 index=getattr(original_input, "index", None),
133 columns=estimator.get_feature_names_out,
134 )
File ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/sklearn/utils/_set_output.py:59, in _wrap_in_pandas_container(data_to_wrap, columns, index)
57 data_to_wrap.columns = columns
58 if index is not None:
---> 59 data_to_wrap.index = index
60 return data_to_wrap
62 return pd.DataFrame(data_to_wrap, index=index, columns=columns)
File ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/core/generic.py:5588, in NDFrame.__setattr__(self, name, value)
5586 try:
5587 object.__getattribute__(self, name)
-> 5588 return object.__setattr__(self, name, value)
5589 except AttributeError:
5590 pass
File ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/_libs/properties.pyx:70, in pandas._libs.properties.AxisProperty.__set__()
File ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/core/generic.py:769, in NDFrame._set_axis(self, axis, labels)
767 def _set_axis(self, axis: int, labels: Index) -> None:
768 labels = ensure_index(labels)
--> 769 self._mgr.set_axis(axis, labels)
770 self._clear_item_cache()
File ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/core/internals/managers.py:214, in BaseBlockManager.set_axis(self, axis, new_labels)
212 def set_axis(self, axis: int, new_labels: Index) -> None:
213 # Caller is responsible for ensuring we have an Index object.
--> 214 self._validate_set_axis(axis, new_labels)
215 self.axes[axis] = new_labels
File ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/core/internals/base.py:69, in DataManager._validate_set_axis(self, axis, new_labels)
66 pass
68 elif new_len != old_len:
---> 69 raise ValueError(
70 f"Length mismatch: Expected axis has {old_len} elements, new "
71 f"values have {new_len} elements"
72 )
ValueError: Length mismatch: Expected axis has 4 elements, new values have 96 elementsVersions
System:
python: 3.10.6 (main, Aug 30 2022, 05:11:14) [Clang 13.0.0 (clang-1300.0.29.30)]
executable: /Users/macbookpro/.local/share/virtualenvs/3e_VBrf2/bin/python
machine: macOS-11.3-x86_64-i386-64bit
Python dependencies:
sklearn: 1.2.1
pip: 22.3.1
setuptools: 67.3.2
numpy: 1.23.5
scipy: 1.10.1
Cython: None
pandas: 1.4.4
matplotlib: 3.7.0
joblib: 1.2.0
threadpoolctl: 3.1.0
Built with OpenMP: True
threadpoolctl info:
user_api: blas
internal_api: openblas
prefix: libopenblas
filepath: /Users/macbookpro/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/numpy/.dylibs/libopenblas64_.0.dylib
version: 0.3.20
threading_layer: pthreads
architecture: Haswell
num_threads: 4
user_api: openmp
internal_api: openmp
prefix: libomp
filepath: /Users/macbookpro/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/sklearn/.dylibs/libomp.dylib
version: None
num_threads: 8
user_api: blas
internal_api: openblas
prefix: libopenblas
filepath: /Users/macbookpro/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/scipy/.dylibs/libopenblas.0.dylib
version: 0.3.18
threading_layer: pthreads
architecture: Haswell
num_threads: 4