scikit-learn · nithish08 · May 27, 2021 · May 27, 2021 · May 27, 2021 · May 27, 2021
diff --git a/sklearn/__check_build/__init__.py b/sklearn/__check_build/__init__.py
@@ -2,6 +2,7 @@
 compile scikit-learn properly.
 """
 import os
+from importlib import resources
 
 INPLACE_MSG = """
 It appears that you are importing a local scikit-learn source tree. For
@@ -16,7 +17,8 @@
 def raise_build_error(e):
     # Raise a comprehensible error and list the contents of the
     # directory to help debugging on the mailing list.
-    local_dir = os.path.split(__file__)[0]
+    with resources.path('sklearn', '__check_build') as f:
+        local_dir = f.as_posix()
     msg = STANDARD_MSG
     if local_dir == "sklearn/__check_build":
         # Picking up the local install: this will work only if the

diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py
@@ -12,7 +12,8 @@
 import shutil
 from collections import namedtuple
 from os import environ, listdir, makedirs
-from os.path import dirname, expanduser, isdir, join, splitext
+from os.path import expanduser, isdir, join, splitext
+from importlib import resources
 
 from ..utils import Bunch
 from ..utils import check_random_state
@@ -348,7 +349,8 @@ def load_wine(*, return_X_y=False, as_frame=False):
     >>> list(data.target_names)
     ['class_0', 'class_1', 'class_2']
     """
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     data, target, target_names = load_data(module_path, 'wine_data.csv')
 
     with open(join(module_path, 'descr', 'wine_data.rst')) as rst_file:
@@ -471,7 +473,8 @@ def load_iris(*, return_X_y=False, as_frame=False):
     >>> list(data.target_names)
     ['setosa', 'versicolor', 'virginica']
     """
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     data, target, target_names = load_data(module_path, 'iris.csv')
     iris_csv_filename = join(module_path, 'data', 'iris.csv')
 
@@ -582,7 +585,8 @@ def load_breast_cancer(*, return_X_y=False, as_frame=False):
     >>> list(data.target_names)
     ['malignant', 'benign']
     """
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     data, target, target_names = load_data(module_path, 'breast_cancer.csv')
     csv_filename = join(module_path, 'data', 'breast_cancer.csv')
 
@@ -709,7 +713,8 @@ def load_digits(*, n_class=10, return_X_y=False, as_frame=False):
         >>> plt.matshow(digits.images[0]) #doctest: +SKIP
         >>> plt.show() #doctest: +SKIP
     """
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     data = np.loadtxt(join(module_path, 'data', 'digits.csv.gz'),
                       delimiter=',')
     with open(join(module_path, 'descr', 'digits.rst')) as f:
@@ -758,7 +763,7 @@ def load_diabetes(*, return_X_y=False, as_frame=False):
     Features         real, -.2 < x < .2
     Targets          integer 25 - 346
     ==============   ==================
-    
+
     .. note::
        The meaning of each feature (i.e. `feature_names`) might be unclear
        (especially for `ltg`) as the documentation of the original dataset is
@@ -813,7 +818,8 @@ def load_diabetes(*, return_X_y=False, as_frame=False):
 
         .. versionadded:: 0.18
     """
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     base_dir = join(module_path, 'data')
     data_filename = join(base_dir, 'diabetes_data.csv.gz')
     data = np.loadtxt(data_filename)
@@ -911,7 +917,8 @@ def load_linnerud(*, return_X_y=False, as_frame=False):
 
         .. versionadded:: 0.18
     """
-    base_dir = join(dirname(__file__), 'data/')
+    with resources.path('sklearn.datasets', 'data') as f:
+        base_dir = f.as_posix()
     data_filename = join(base_dir, 'linnerud_exercise.csv')
     target_filename = join(base_dir, 'linnerud_physiological.csv')
 
@@ -925,7 +932,9 @@ def load_linnerud(*, return_X_y=False, as_frame=False):
     with open(target_filename) as f:
         header_physiological = f.readline().split()
 
-    with open(dirname(__file__) + '/descr/linnerud.rst') as f:
+    with resources.path('sklearn.datasets', 'descr') as f:
+        module_path = f.as_posix()
+    with open(join(module_path, 'linnerud.rst')) as f:
         descr = f.read()
 
     frame = None
@@ -1005,7 +1014,8 @@ def load_boston(*, return_X_y=False):
     >>> print(X.shape)
     (506, 13)
     """
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
 
     fdescr_name = join(module_path, 'descr', 'boston_house_prices.rst')
     with open(fdescr_name) as f:
@@ -1073,7 +1083,8 @@ def load_sample_images():
     # import PIL only when needed
     from ..externals._pilutil import imread
 
-    module_path = join(dirname(__file__), "images")
+    with resources.path('sklearn.datasets', 'images') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'README.txt')) as f:
         descr = f.read()
     filenames = [join(module_path, filename)
@@ -1082,7 +1093,7 @@ def load_sample_images():
     # Load image data for each image in the source folder.
     images = [imread(filename) for filename in filenames]
 
-    return Bunch(images=images,
+    return Bunch(images=images,
                  filenames=filenames,
                  DESCR=descr)
 

diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py
@@ -21,9 +21,10 @@
 # Authors: Peter Prettenhofer
 # License: BSD 3 clause
 
-from os.path import dirname, exists, join
+from os.path import exists, join
 from os import makedirs, remove
 import tarfile
+from importlib import resources
 
 import numpy as np
 import logging
@@ -163,7 +164,8 @@ def fetch_california_housing(*, data_home=None, download_if_missing=True,
     # target in units of 100,000
     target = target / 100000.0
 
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'descr', 'california_housing.rst')) as dfile:
         descr = dfile.read()
 

diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py
@@ -16,8 +16,9 @@
 
 from gzip import GzipFile
 import logging
-from os.path import dirname, exists, join
+from os.path import exists, join
 from os import remove, makedirs
+from importlib import resources
 
 import numpy as np
 import joblib
@@ -170,7 +171,8 @@ def fetch_covtype(*, data_home=None, download_if_missing=True,
         X = X[ind]
         y = y[ind]
 
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'descr', 'covtype.rst')) as rst_file:
         fdescr = rst_file.read()
 

diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py
@@ -12,7 +12,8 @@
 from gzip import GzipFile
 import logging
 import os
-from os.path import dirname, exists, join
+from os.path import exists, join
+from importlib import resources
 
 import numpy as np
 import joblib
@@ -193,7 +194,8 @@ def fetch_kddcup99(*, subset=None, data_home=None, shuffle=False,
     if shuffle:
         data, target = shuffle_method(data, target, random_state=random_state)
 
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'descr', 'kddcup99.rst')) as rst_file:
         fdescr = rst_file.read()
 

diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py
@@ -9,13 +9,14 @@
 # License: BSD 3 clause
 
 from os import listdir, makedirs, remove
-from os.path import dirname, join, exists, isdir
+from os.path import join, exists, isdir
 
 import logging
 
 import numpy as np
 import joblib
 from joblib import Memory
+from importlib import resources
 
 from ._base import get_data_home, _fetch_remote, RemoteFileMetadata
 from ..utils import Bunch
@@ -316,7 +317,8 @@ def fetch_lfw_people(*, data_home=None, funneled=True, resize=0.5,
 
     X = faces.reshape(len(faces), -1)
 
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'descr', 'lfw.rst')) as rst_file:
         fdescr = rst_file.read()
 
@@ -497,7 +499,8 @@ def fetch_lfw_pairs(*, subset='train', data_home=None, funneled=True,
         index_file_path, data_folder_path, resize=resize, color=color,
         slice_=slice_)
 
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'descr', 'lfw.rst')) as rst_file:
         fdescr = rst_file.read()
 

diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py
@@ -13,12 +13,13 @@
 # Copyright (c) 2011 David Warde-Farley <wardefar at iro dot umontreal dot ca>
 # License: BSD 3 clause
 
-from os.path import dirname, exists, join
+from os.path import exists, join
 from os import makedirs, remove
 
 import numpy as np
 from scipy.io.matlab import loadmat
 import joblib
+from importlib import resources
 
 from . import get_data_home
 from ._base import _fetch_remote
@@ -132,7 +133,8 @@ def fetch_olivetti_faces(*, data_home=None, shuffle=False, random_state=0,
         target = target[order]
     faces_vectorized = faces.reshape(len(faces), -1)
 
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'descr', 'olivetti_faces.rst')) as rst_file:
         fdescr = rst_file.read()
 

diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py
@@ -11,8 +11,9 @@
 import logging
 
 from os import remove, makedirs
-from os.path import dirname, exists, join
+from os.path import exists, join
 from gzip import GzipFile
+from importlib import resources
 
 import numpy as np
 import scipy.sparse as sp
@@ -260,7 +261,8 @@ def fetch_rcv1(*, data_home=None, subset='all', download_if_missing=True,
     if shuffle:
         X, y, sample_id = shuffle_(X, y, sample_id, random_state=random_state)
 
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'descr', 'rcv1.rst')) as rst_file:
         fdescr = rst_file.read()
 

diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py
@@ -25,13 +25,14 @@
 # License: BSD 3 clause
 
 import os
-from os.path import dirname, join
+from os.path import join
 import logging
 import tarfile
 import pickle
 import shutil
 import re
 import codecs
+from importlib import resources
 
 import numpy as np
 import scipy.sparse as sp
@@ -278,7 +279,8 @@ def fetch_20newsgroups(*, data_home=None, subset='train', categories=None,
         raise ValueError(
             "subset can only be 'train', 'test' or 'all', got '%s'" % subset)
 
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'descr', 'twenty_newsgroups.rst')) as rst_file:
         fdescr = rst_file.read()
 
@@ -488,7 +490,8 @@ def fetch_20newsgroups_vectorized(*, subset="train", remove=(), data_home=None,
         raise ValueError("%r is not a valid subset: should be one of "
                          "['train', 'test', 'all']" % subset)
 
-    module_path = dirname(__file__)
+    with resources.path('sklearn', 'datasets') as f:
+        module_path = f.as_posix()
     with open(join(module_path, 'descr', 'twenty_newsgroups.rst')) as rst_file:
         fdescr = rst_file.read()
 

diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
@@ -6,6 +6,7 @@
 import os
 import re
 from io import BytesIO
+from importlib import resources  # type: ignore
 
 import numpy as np
 import scipy.sparse
@@ -30,8 +31,8 @@
 from functools import partial
 from sklearn.utils._testing import fails_if_pypy
 
-
-currdir = os.path.dirname(os.path.abspath(__file__))
+with resources.path('sklearn.datasets', 'tests') as f:
+    currdir = f.as_posix()
 # if True, urlopen will be monkey patched to only use local files
 test_offline = True
 

diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py
@@ -6,6 +6,7 @@
 import os
 import shutil
 from tempfile import NamedTemporaryFile
+from importlib import resources  # type: ignore
 
 import pytest
 
@@ -17,7 +18,8 @@
 from sklearn.datasets import (load_svmlight_file, load_svmlight_files,
                               dump_svmlight_file)
 
-currdir = os.path.dirname(os.path.abspath(__file__))
+with resources.path('sklearn.datasets', 'tests') as f:
+    currdir = f.as_posix()
 datafile = os.path.join(currdir, "data", "svmlight_classification.txt")
 multifile = os.path.join(currdir, "data", "svmlight_multilabel.txt")
 invalidfile = os.path.join(currdir, "data", "svmlight_invalid.txt")

diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
@@ -3,7 +3,7 @@
 """
 import pkgutil
 import inspect
-from importlib import import_module
+from importlib import import_module, resources
 from operator import itemgetter
 from collections.abc import Sequence
 from contextlib import contextmanager
@@ -13,7 +13,6 @@
 import platform
 import struct
 import timeit
-from pathlib import Path
 
 import warnings
 import numpy as np
@@ -1127,7 +1126,8 @@ def is_abstract(c):
 
     all_classes = []
     modules_to_ignore = {"tests", "externals", "setup", "conftest"}
-    root = str(Path(__file__).parent.parent)  # sklearn package
+    with resources.path('sklearn', '.') as f:
+        root = f.as_posix()  # sklearn package
     # Ignore deprecation warnings triggered at import time and from walking
     # packages
     with ignore_warnings(category=FutureWarning):