Open
Description
Describe the bug
`_safe_indexing` fails with `pyarrow==16.0.0` because `filter()` expects a pyarrow boolean array as its mask and cannot handle being passed a numpy boolean array or a list.
I found apache/arrow#42013 addressing this and it was fixed for version 17.0.0. Upgrading my pyarrow version has resolved the issue for me.
We accept pyarrow==12.0.0 as a minimum (optional) dependency.
In the CI, we test in `pylatest_conda_forge_mkl_linux` with `pyarrow==20.0.0` (only).
Steps/Code to Reproduce
Run `test_safe_indexing_1d_container_mask`.
Expected Results
no errors
Actual Results
Traceback:
array_type = 'pyarrow_array', indices_type = 'series'
@pytest.mark.parametrize(
"array_type", ["list", "array", "series", "polars_series", "pyarrow_array"]
)
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
def test_safe_indexing_1d_container_mask(array_type, indices_type):
indices = [False] + [True] * 2 + [False] * 6
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = _convert_container(indices, indices_type)
> subset = _safe_indexing(array, indices, axis=0)
sklearn/utils/tests/test_indexing.py:229:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
sklearn/utils/_indexing.py:323: in _safe_indexing
return _pyarrow_indexing(X, indices, indices_dtype, axis=axis)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
X = <pyarrow.lib.Int64Array object at 0x7f08c2789c60>
[
1,
2,
3,
4,
5,
6,
7,
8,
9
]
key = array([False, True, True, False, False, False, False, False, False]), key_dtype = 'bool', axis = 0
def _pyarrow_indexing(X, key, key_dtype, axis):
"""Index a pyarrow data."""
scalar_key = np.isscalar(key)
if isinstance(key, slice):
if isinstance(key.stop, str):
start = X.column_names.index(key.start)
stop = X.column_names.index(key.stop) + 1
else:
start = 0 if not key.start else key.start
stop = key.stop
step = 1 if not key.step else key.step
key = list(range(start, stop, step))
if axis == 1:
# Here we are certain that X is a pyarrow Table or RecordBatch.
if key_dtype == "int" and not isinstance(key, list):
# pyarrow's X.select behavior is more consistent with integer lists.
key = np.asarray(key).tolist()
if key_dtype == "bool":
key = np.asarray(key).nonzero()[0].tolist()
if scalar_key:
return X.column(key)
return X.select(key)
# axis == 0 from here on
if scalar_key:
if hasattr(X, "shape"):
# X is a Table or RecordBatch
key = [key]
else:
return X[key].as_py()
elif not isinstance(key, list):
key = np.asarray(key)
if key_dtype == "bool":
> X_indexed = X.filter(key)
E TypeError: Argument 'mask' has incorrect type (expected pyarrow.lib.Array, got numpy.ndarray)
sklearn/utils/_indexing.py:134: TypeError
Versions
Python dependencies:
sklearn: 1.8.dev0
pip: 25.1.1
setuptools: 69.5.1
numpy: 2.1.3
scipy: 1.15.0
Cython: 3.0.10
pandas: 2.2.2
matplotlib: 3.9.2
joblib: 1.4.0
threadpoolctl: 3.4.0
Built with OpenMP: True
threadpoolctl info:
user_api: openmp
internal_api: openmp
num_threads: 14
prefix: libgomp
filepath: /usr/lib/libgomp.so.1.0.0
version: None
And pyarrow==16.0.0.