diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 09beb091cc56f..f6072d1023d7f 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -220,6 +220,11 @@ Changelog - |Enhancement| :func:`datasets.dump_svmlight_file` is now accelerated with a Cython implementation, providing 2-4x speedups. :pr:`23127` by :user:`Meekail Zain ` + +- |Enhancement| Path-like objects, such as those created with pathlib are now + allowed as paths in :func:`datasets.load_svmlight_file` and + :func:`datasets.load_svmlight_files`. + :pr:`19075` by :user:`Carlos Ramos CarreƱo ` :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 5196f2088439b..16aae0de4f2b0 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -85,12 +85,15 @@ def load_svmlight_file( Parameters ---------- - f : str, file-like or int + f : str, path-like, file-like or int (Path to) a file to load. If a path ends in ".gz" or ".bz2", it will be uncompressed on the fly. If an integer is passed, it is assumed to be a file descriptor. A file-like or file descriptor will not be closed by this function. A file-like object must be opened in binary mode. + .. versionchanged:: 1.2 + Path-like objects are now accepted. + n_features : int, default=None The number of features to use. If None, it will be inferred. This argument is useful to load several files that are subsets of a @@ -182,8 +185,10 @@ def get_data(): def _gen_open(f): if isinstance(f, int): # file descriptor return io.open(f, "rb", closefd=False) + elif isinstance(f, os.PathLike): + f = os.fspath(f) elif not isinstance(f, str): - raise TypeError("expected {str, int, file-like}, got %s" % type(f)) + raise TypeError("expected {str, int, path-like, file-like}, got %s" % type(f)) _, ext = os.path.splitext(f) if ext == ".gz": @@ -249,13 +254,16 @@ def load_svmlight_files( Parameters ---------- - files : array-like, dtype=str, file-like or int + files : array-like, dtype=str, path-like, file-like or int (Paths of) files to load. If a path ends in ".gz" or ".bz2", it will be uncompressed on the fly. If an integer is passed, it is assumed to be a file descriptor. File-likes and file descriptors will not be closed by this function. File-like objects must be opened in binary mode. + .. versionchanged:: 1.2 + Path-like objects are now accepted. + n_features : int, default=None The number of features to use. If None, it will be inferred from the maximum column index occurring in any of the files. diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index 6a85faf82b8a2..5d27aefea54c3 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -11,7 +11,7 @@ import pytest from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_array_almost_equal, assert_allclose from sklearn.utils._testing import fails_if_pypy import sklearn @@ -89,6 +89,16 @@ def test_load_svmlight_file_fd(): os.close(fd) +def test_load_svmlight_pathlib(): + # test loading from file descriptor + with resources.path(TEST_DATA_MODULE, datafile) as data_path: + X1, y1 = load_svmlight_file(str(data_path)) + X2, y2 = load_svmlight_file(data_path) + + assert_allclose(X1.data, X2.data) + assert_allclose(y1, y2) + + def test_load_svmlight_file_multilabel(): X, y = _load_svmlight_local_test_file(multifile, multilabel=True) assert y == [(0, 1), (2,), (), (1, 2)]