From 1062baee2f9d80503e60ff8d927b58cd7aa4cfb4 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 28 Dec 2020 17:44:20 +0100 Subject: [PATCH 01/11] Allow path-like objects in load_svmlight_file. Add also a test for this functionality. --- sklearn/datasets/_svmlight_format_io.py | 10 +++++++--- sklearn/datasets/tests/test_svmlight_format.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 8997624da0755..403d42d7dd580 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -74,7 +74,7 @@ def load_svmlight_file(f, *, n_features=None, dtype=np.float64, Parameters ---------- - f : str, file-like or int + f : str, path-like, file-like or int (Path to) a file to load. If a path ends in ".gz" or ".bz2", it will be uncompressed on the fly. If an integer is passed, it is assumed to be a file descriptor. A file-like or file descriptor will not be closed @@ -164,8 +164,12 @@ def get_data(): def _gen_open(f): if isinstance(f, int): # file descriptor return io.open(f, "rb", closefd=False) + elif isinstance(f, os.PathLike): + f = os.fspath(f) elif not isinstance(f, str): - raise TypeError("expected {str, int, file-like}, got %s" % type(f)) + raise TypeError( + "expected {str, int, path-like, file-like}, got %s" % type(f) + ) _, ext = os.path.splitext(f) if ext == ".gz": @@ -223,7 +227,7 @@ def load_svmlight_files(files, *, n_features=None, dtype=np.float64, Parameters ---------- - files : array-like, dtype=str, file-like or int + files : array-like, dtype=str, path-like, file-like or int (Paths of) files to load. If a path ends in ".gz" or ".bz2", it will be uncompressed on the fly. If an integer is passed, it is assumed to be a file descriptor. 
File-likes and file descriptors will not be diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index 336069c1c8251..71a896dd4a00c 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -4,6 +4,7 @@ import numpy as np import scipy.sparse as sp import os +import pathlib import shutil from tempfile import NamedTemporaryFile @@ -70,6 +71,15 @@ def test_load_svmlight_file_fd(): os.close(fd) +def test_load_svmlight_pathlib(): + # test loading from a path-like object + X1, y1 = load_svmlight_file(datafile) + X2, y2 = load_svmlight_file(pathlib.Path(datafile)) + + assert_array_almost_equal(X1.data, X2.data) + assert_array_almost_equal(y1, y2) + + def test_load_svmlight_file_multilabel(): X, y = load_svmlight_file(multifile, multilabel=True) assert y == [(0, 1), (2,), (), (1, 2)] From c4e0e5739560e047921999918744b4e09882f31c Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 28 Dec 2020 18:05:04 +0100 Subject: [PATCH 02/11] Fix linter error. --- sklearn/datasets/tests/test_svmlight_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index 71a896dd4a00c..3199e458ba19a 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -75,7 +75,7 @@ def test_load_svmlight_pathlib(): # test loading from a path-like object X1, y1 = load_svmlight_file(datafile) X2, y2 = load_svmlight_file(pathlib.Path(datafile)) - + assert_array_almost_equal(X1.data, X2.data) assert_array_almost_equal(y1, y2) From 476d2a5b6e7364f7d06f16517c8ebe40370c75cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Fri, 19 Aug 2022 13:40:53 +0200 Subject: [PATCH 03/11] Update what's new Add what's new line for the PR.
--- doc/whats_new/v1.2.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 1960cda3459b0..d6a8757e5603f 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -155,6 +155,9 @@ Changelog - |Enhancement| :func:`datasets.dump_svmlight_file` is now accelerated with a Cython implementation, providing 2-4x speedups. :pr:`23127` by :user:`Meekail Zain ` + +- |Enhancement| Path-like objects, such as those created with pathlib are now allowed as paths in :func:`load_svmlight_file` and :func:`load_svmlight_files`. + :pr:`19075` by :user:`Carlos Ramos Carreño ` :mod:`sklearn.decomposition` ............................ From 38d2ed2469b9b2f84ab7d4fb83b2f72a79557108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Fri, 19 Aug 2022 13:42:45 +0200 Subject: [PATCH 04/11] Add missing module name. --- doc/whats_new/v1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index d6a8757e5603f..6780bc4f5741b 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -156,7 +156,7 @@ Changelog Cython implementation, providing 2-4x speedups. :pr:`23127` by :user:`Meekail Zain ` -- |Enhancement| Path-like objects, such as those created with pathlib are now allowed as paths in :func:`load_svmlight_file` and :func:`load_svmlight_files`. +- |Enhancement| Path-like objects, such as those created with pathlib are now allowed as paths in :func:`datasets.load_svmlight_file` and :func:`datasets.load_svmlight_files`.
:pr:`19075` by :user:`Carlos Ramos Carreño ` :mod:`sklearn.decomposition` From ba4c422bf6b555bd9830531147fad61ee6ee3bbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Fri, 19 Aug 2022 15:57:01 +0200 Subject: [PATCH 05/11] Fix black error (against PEP8, it seems) --- sklearn/datasets/_svmlight_format_io.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 959a8f3def5b1..999c4c3df2037 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -185,9 +185,7 @@ def _gen_open(f): elif isinstance(f, os.PathLike): f = os.fspath(f) elif not isinstance(f, str): - raise TypeError( - "expected {str, int, path-like, file-like}, got %s" % type(f) - ) + raise TypeError("expected {str, int, path-like, file-like}, got %s" % type(f)) _, ext = os.path.splitext(f) if ext == ".gz": From 958de91bbd230553174d70f5ca89117b0df909c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Fri, 19 Aug 2022 16:30:03 +0200 Subject: [PATCH 06/11] Use import resources --- sklearn/datasets/tests/test_svmlight_format.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index c9198f5769971..1ed0f7759043f 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -92,8 +92,9 @@ def test_load_svmlight_file_fd(): def test_load_svmlight_pathlib(): # test loading from a path-like object - X1, y1 = load_svmlight_file(datafile) - X2, y2 = load_svmlight_file(pathlib.Path(datafile)) + with resources.path(TEST_DATA_MODULE, datafile) as data_path: + X1, y1 = load_svmlight_file(str(data_path)) + X2, y2 = load_svmlight_file(data_path) assert_array_almost_equal(X1.data, X2.data) assert_array_almost_equal(y1, y2) From
236f5ec6b287b7cbe1f6ec39d10debd34ecba71f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Fri, 19 Aug 2022 17:12:40 +0200 Subject: [PATCH 07/11] Remove unused import --- sklearn/datasets/tests/test_svmlight_format.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index 1ed0f7759043f..aa869738ac9ff 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -4,7 +4,6 @@ import numpy as np import scipy.sparse as sp import os -import pathlib import shutil from importlib import resources from tempfile import NamedTemporaryFile From a4dd4cc9f6a4a25b03421a13629e7aaa9be572f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 22 Aug 2022 19:39:06 +0200 Subject: [PATCH 08/11] Update sklearn/datasets/tests/test_svmlight_format.py Co-authored-by: Thomas J. Fan --- sklearn/datasets/tests/test_svmlight_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index aa869738ac9ff..6038cb91121d7 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -95,8 +95,8 @@ def test_load_svmlight_pathlib(): X1, y1 = load_svmlight_file(str(data_path)) X2, y2 = load_svmlight_file(data_path) - assert_array_almost_equal(X1.data, X2.data) - assert_array_almost_equal(y1, y2) + assert_allclose(X1.data, X2.data) + assert_allclose(y1, y2) def test_load_svmlight_file_multilabel(): From 7166bb000f54c6fd19201a4c3806703e676e1da9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 22 Aug 2022 19:40:43 +0200 Subject: [PATCH 09/11] Wrap change comment. 
--- doc/whats_new/v1.2.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 6780bc4f5741b..3b18a4678c152 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -156,7 +156,9 @@ Changelog Cython implementation, providing 2-4x speedups. :pr:`23127` by :user:`Meekail Zain ` -- |Enhancement| Path-like objects, such as those created with pathlib are now allowed as paths in :func:`datasets.load_svmlight_file` and :func:`datasets.load_svmlight_files`. +- |Enhancement| Path-like objects, such as those created with pathlib are now + allowed as paths in :func:`datasets.load_svmlight_file` and + :func:`datasets.load_svmlight_files`. :pr:`19075` by :user:`Carlos Ramos Carreño ` :mod:`sklearn.decomposition` From 2a270be00b5dd9ecdd8073912c413fd6ae2031b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 22 Aug 2022 23:08:26 +0200 Subject: [PATCH 10/11] Add versionchanged directives --- sklearn/datasets/_svmlight_format_io.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 999c4c3df2037..16aae0de4f2b0 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -91,6 +91,9 @@ def load_svmlight_file( be a file descriptor. A file-like or file descriptor will not be closed by this function. A file-like object must be opened in binary mode. + .. versionchanged:: 1.2 + Path-like objects are now accepted. + n_features : int, default=None The number of features to use. If None, it will be inferred. This argument is useful to load several files that are subsets of a @@ -258,6 +261,9 @@ def load_svmlight_files( closed by this function. File-like objects must be opened in binary mode. + .. versionchanged:: 1.2 + Path-like objects are now accepted. + n_features : int, default=None The number of features to use. 
If None, it will be inferred from the maximum column index occurring in any of the files. From 8e33f9d1ccd22d57cc50fe02142c930183657d22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Thu, 1 Sep 2022 13:11:52 +0200 Subject: [PATCH 11/11] Fix tests --- sklearn/datasets/tests/test_svmlight_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index 6038cb91121d7..5d27aefea54c3 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -11,7 +11,7 @@ import pytest from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_array_almost_equal, assert_allclose from sklearn.utils._testing import fails_if_pypy import sklearn