Closed
Description
Describe the bug
Having both `OrdinalEncoder` and `OneHotEncoder` inside the parameter grid used by `GridSearchCV` or `RandomizedSearchCV` results in the following error: `TypeError: float() argument must be a string or a real number, not 'OneHotEncoder'`.
Steps/Code to Reproduce
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# Ask scikit-learn to produce pandas output from its transformers.
set_config(transform_output="pandas")

# Seed NumPy's global generator so the synthetic data is reproducible.
np.random.seed(42)

# Synthetic dataset: 1000 rows and 5 columns (two numeric features, two
# categorical features and the target). The draws are made in the same order
# as the columns are laid out, so the seeded random stream is consumed in a
# fixed sequence.
num_rows = 1000
numeric_1 = np.random.randn(num_rows)  # normally distributed floats
numeric_3 = np.random.randint(1, 100, size=num_rows)  # integers in [1, 100)
object_1 = np.random.choice(["A", "B", "C", "D"], size=num_rows)
object_2 = np.random.choice(["X", "Y", "Z"], size=num_rows)
target = np.random.rand(num_rows) * 100  # uniform floats in [0, 100)

data = {
    "numeric_1": numeric_1,
    "numeric_3": numeric_3,
    "object_1": object_1,
    "object_2": object_2,
    "target": target,
}
df = pd.DataFrame(data)

# Separate the features from the regression target.
y = df["target"]
X = df.drop(columns="target")

# Encode the two categorical columns and pass the numeric ones through.
categorical_columns = ["object_1", "object_2"]
enc = ColumnTransformer(
    transformers=[
        ("enc", OneHotEncoder(sparse_output=False), categorical_columns),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

pipe = Pipeline(
    steps=[
        ("enc", enc),
        ("regressor", HistGradientBoostingRegressor()),
    ]
)

# The searched "parameter" is the encoder object itself: two estimator
# instances are offered as candidate values for the "enc__enc" key.
grid_params = {
    "enc__enc": [
        OneHotEncoder(sparse_output=False),
        OrdinalEncoder(),
    ],
}

# This is the call that fails with the TypeError reported in this issue.
grid_search = GridSearchCV(estimator=pipe, param_grid=grid_params, cv=5)
grid_search.fit(X, y)

# RandomizedSearchCV produces the same error
# rand_search = RandomizedSearchCV(pipe, grid_params, cv=5)
# rand_search.fit(X, y)
Expected Results
I expected `grid_search.fit(X, y)` to run to completion without raising an error.
Actual Results
{
"name": "TypeError",
"message": "float() argument must be a string or a real number, not 'OneHotEncoder'",
"stack": "---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[108], line 1
----> 1 grid_search.fit(X, y)
File ~/Documents/Coding/ML_exercises/.venv/lib/python3.12/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File ~/Documents/Coding/ML_exercises/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:968, in BaseSearchCV.fit(self, X, y, **params)
962 results = self._format_results(
963 all_candidate_params, n_splits, all_out, all_more_results
964 )
966 return results
--> 968 self._run_search(evaluate_candidates)
970 # multimetric is determined here because in the case of a callable
971 # self.scoring the return type is only known after calling
972 first_test_score = all_out[0][\"test_scores\"]
File ~/Documents/Coding/ML_exercises/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:1543, in GridSearchCV._run_search(self, evaluate_candidates)
1541 def _run_search(self, evaluate_candidates):
1542 \"\"\"Search all candidates in param_grid\"\"\"
-> 1543 evaluate_candidates(ParameterGrid(self.param_grid))
File ~/Documents/Coding/ML_exercises/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:962, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
959 all_more_results[key].extend(value)
961 nonlocal results
--> 962 results = self._format_results(
963 all_candidate_params, n_splits, all_out, all_more_results
964 )
966 return results
File ~/Documents/Coding/ML_exercises/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:1098, in BaseSearchCV._format_results(self, candidate_params, n_splits, out, more_results)
1094 arr_dtype = object
1095 if len(param_list) == n_candidates and arr_dtype != object:
1096 # Exclude `object` else the numpy constructor might infer a list of
1097 # tuples to be a 2d array.
-> 1098 results[key] = MaskedArray(param_list, mask=False, dtype=arr_dtype)
1099 else:
1100 # Use one MaskedArray and mask all the places where the param is not
1101 # applicable for that candidate (which may not contain all the params).
1102 ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype)
File ~/Documents/Coding/ML_exercises/.venv/lib/python3.12/site-packages/numpy/ma/core.py:2820, in MaskedArray.__new__(cls, data, mask, dtype, copy, subok, ndmin, fill_value, keep_mask, hard_mask, shrink, order)
2811 \"\"\"
2812 Create a new masked array from scratch.
2813
(...)
2817
2818 \"\"\"
2819 # Process data.
-> 2820 _data = np.array(data, dtype=dtype, copy=copy,
2821 order=order, subok=True, ndmin=ndmin)
2822 _baseclass = getattr(data, '_baseclass', type(_data))
2823 # Check that we're not erasing the mask.
TypeError: float() argument must be a string or a real number, not 'OneHotEncoder'"
}
Versions
System:
python: 3.12.3 (main, Apr 9 2024, 08:09:14) [Clang 15.0.0 (clang-1500.3.9.4)]
executable: /Users/brice/Documents/Coding/ML_exercises/.venv/bin/python
machine: macOS-14.5-arm64-arm-64bit
Python dependencies:
sklearn: 1.5.0
pip: 24.0
setuptools: 70.0.0
numpy: 1.26.4
scipy: 1.13.1
Cython: None
pandas: 2.2.2
matplotlib: 3.9.0
joblib: 1.4.2
threadpoolctl: 3.5.0
Built with OpenMP: True
threadpoolctl info:
user_api: blas
internal_api: openblas
num_threads: 8
prefix: libopenblas
filepath: /Users/brice/Documents/Coding/ML_exercises/.venv/lib/python3.12/site-packages/numpy/.dylibs/libopenblas64_.0.dylib
version: 0.3.23.dev
threading_layer: pthreads
architecture: armv8
user_api: blas
internal_api: openblas
num_threads: 8
prefix: libopenblas
filepath: /Users/brice/Documents/Coding/ML_exercises/.venv/lib/python3.12/site-packages/scipy/.dylibs/libopenblas.0.dylib
version: 0.3.27
threading_layer: pthreads
architecture: neoversen1
user_api: openmp
internal_api: openmp
num_threads: 8
prefix: libomp
filepath: /Users/brice/Documents/Coding/ML_exercises/.venv/lib/python3.12/site-packages/sklearn/.dylibs/libomp.dylib
version: None