MAINT Clean deprecation for 1.2: load_boston #24603

Merged

64 changes: 62 additions & 2 deletions sklearn/datasets/__init__.py
@@ -3,8 +3,9 @@
including methods to load and fetch popular reference datasets. It also
features some artificial data generators.
"""
import textwrap

from ._base import load_breast_cancer
from ._base import load_boston
from ._base import load_diabetes
from ._base import load_digits
from ._base import load_files
@@ -66,7 +67,6 @@
    "fetch_kddcup99",
    "fetch_openml",
    "get_data_home",
    "load_boston",
    "load_diabetes",
    "load_digits",
    "load_files",
@@ -99,3 +99,63 @@
    "make_spd_matrix",
    "make_swiss_roll",
]


def __getattr__(name):
    if name == "load_boston":
        msg = textwrap.dedent(
            """
            `load_boston` was removed from scikit-learn in version 1.2.

            The Boston housing prices dataset has an ethical problem: as
            investigated in [1], the authors of this dataset engineered a
            non-invertible variable "B" assuming that racial self-segregation
            had a positive impact on house prices [2]. Furthermore, the goal
            of the research that led to the creation of this dataset was to
            study the impact of air quality, but it did not adequately
            demonstrate the validity of this assumption.

            The scikit-learn maintainers therefore strongly discourage the use
            of this dataset unless the purpose of the code is to study and
            educate about ethical issues in data science and machine learning.

            In this special case, you can fetch the dataset from the original
            source::

                import pandas as pd
                import numpy as np

                data_url = "http://lib.stat.cmu.edu/datasets/boston"
                raw_df = pd.read_csv(data_url, sep="\\s+", skiprows=22, header=None)
                data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
                target = raw_df.values[1::2, 2]

            Alternative datasets include the California housing dataset and the
            Ames housing dataset. You can load the datasets as follows::

                from sklearn.datasets import fetch_california_housing
                housing = fetch_california_housing()

            for the California housing dataset and::

                from sklearn.datasets import fetch_openml
                housing = fetch_openml(name="house_prices", as_frame=True)

            for the Ames housing dataset.

            [1] M Carlisle.
            "Racist data destruction?"
            <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

            [2] Harrison Jr, David, and Daniel L. Rubinfeld.
            "Hedonic housing prices and the demand for clean air."
            Journal of environmental economics and management 5.1 (1978): 81-102.
            <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>
            """
        )
        raise ImportError(msg)
    try:
        return globals()[name]
    except KeyError:
        # The import machinery turns this AttributeError into the standard
        # "cannot import name ..." ImportError.
        raise AttributeError
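
Aside: the module-level `__getattr__` above relies on the PEP 562 hook
(Python 3.7+), which Python calls only when normal attribute lookup on the
module fails, so ordinary imports are unaffected. A minimal sketch of the
mechanism, using a hypothetical package `mypkg` (not scikit-learn code)::

    # mypkg/__init__.py -- hypothetical PEP 562 example
    def __getattr__(name):
        # Called only when `mypkg.<name>` is not found by normal lookup.
        if name == "removed_function":
            raise ImportError("`removed_function` was removed in 2.0.")
        # For any other missing name, `from mypkg import <name>` converts
        # this AttributeError into the usual "cannot import name" ImportError.
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

With this hook, `from mypkg import removed_function` fails with the custom
message, while any other missing name still produces the standard error;
that is exactly the split exercised by the tests further down.
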
188 changes: 0 additions & 188 deletions sklearn/datasets/_base.py
@@ -21,7 +21,6 @@
from ..utils import Bunch
from ..utils import check_random_state
from ..utils import check_pandas_support
from ..utils.deprecation import deprecated

import numpy as np

@@ -1170,193 +1169,6 @@ def load_linnerud(*, return_X_y=False, as_frame=False):
    )


@deprecated(
    r"""`load_boston` is deprecated in 1.0 and will be removed in 1.2.

    The Boston housing prices dataset has an ethical problem. You can refer
    to the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of
    this dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_housing
        housing = fetch_california_housing()

    for the California housing dataset and::

        from sklearn.datasets import fetch_openml
        housing = fetch_openml(name="house_prices", as_frame=True)

    for the Ames housing dataset."""
)
def load_boston(*, return_X_y=False):
    r"""Load and return the Boston house-prices dataset (regression).

    ==============   ==============
    Samples total    506
    Dimensionality   13
    Features         real, positive
    Targets          real 5. - 50.
    ==============   ==============

    Read more in the :ref:`User Guide <boston_dataset>`.

    .. warning::
        The Boston housing prices dataset has an ethical problem: as
        investigated in [1]_, the authors of this dataset engineered a
        non-invertible variable "B" assuming that racial self-segregation
        had a positive impact on house prices [2]_. Furthermore, the goal of
        the research that led to the creation of this dataset was to study
        the impact of air quality, but it did not adequately demonstrate the
        validity of this assumption.

        The scikit-learn maintainers therefore strongly discourage the use of
        this dataset unless the purpose of the code is to study and educate
        about ethical issues in data science and machine learning.

        In this special case, you can fetch the dataset from the original
        source::

            import pandas as pd
            import numpy as np

            data_url = "http://lib.stat.cmu.edu/datasets/boston"
            raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
            data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
            target = raw_df.values[1::2, 2]

        Alternative datasets include the California housing dataset [3]_
        (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and the Ames
        housing dataset [4]_. You can load the datasets as follows::

            from sklearn.datasets import fetch_california_housing
            housing = fetch_california_housing()

        for the California housing dataset and::

            from sklearn.datasets import fetch_openml
            housing = fetch_openml(name="house_prices", as_frame=True)

        for the Ames housing dataset.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object. See
        below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (506, 13)
            The data matrix.
        target : ndarray of shape (506,)
            The regression target.
        filename : str
            The physical location of the Boston CSV dataset.

            .. versionadded:: 0.20

        DESCR : str
            The full description of the dataset.
        feature_names : ndarray
            The names of the features.

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarrays. The first contains a 2D array of shape
        (506, 13) with each row representing one sample and each column
        representing a feature. The second array of shape (506,) contains
        the target samples.

        .. versionadded:: 0.18

    Notes
    -----
    .. versionchanged:: 0.20
        Fixed a wrong data point at [445, 0].

    References
    ----------
    .. [1] `Racist data destruction? M Carlisle,
           <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>`_
    .. [2] `Harrison Jr, David, and Daniel L. Rubinfeld.
           "Hedonic housing prices and the demand for clean air."
           Journal of environmental economics and management 5.1 (1978): 81-102.
           <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>`_
    .. [3] `California housing dataset
           <https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset>`_
    .. [4] `Ames housing dataset
           <https://www.openml.org/d/42165>`_

    Examples
    --------
    >>> import warnings
    >>> from sklearn.datasets import load_boston
    >>> with warnings.catch_warnings():
    ...     # You should probably not use this dataset.
    ...     warnings.filterwarnings("ignore")
    ...     X, y = load_boston(return_X_y=True)
    >>> print(X.shape)
    (506, 13)
    """
    # TODO: once the deprecation period is over, implement a module level
    # `__getattr__` function in `sklearn.datasets` to raise an exception with
    # an informative error message at import time instead of just removing
    # load_boston. The goal is to avoid having beginners that copy-paste code
    # from numerous books and tutorials that use this dataset loader get a
    # confusing ImportError when trying to learn scikit-learn.
    # See: https://www.python.org/dev/peps/pep-0562/

    descr_text = load_descr("boston_house_prices.rst")

    data_file_name = "boston_house_prices.csv"
    with resources.open_text(DATA_MODULE, data_file_name) as f:
        data_file = csv.reader(f)
        # The CSV header stores the number of samples and features.
        temp = next(data_file)
        n_samples = int(temp[0])
        n_features = int(temp[1])
        data = np.empty((n_samples, n_features))
        target = np.empty((n_samples,))
        temp = next(data_file)  # names of features
        feature_names = np.array(temp)

        for i, d in enumerate(data_file):
            data[i] = np.asarray(d[:-1], dtype=np.float64)
            target[i] = np.asarray(d[-1], dtype=np.float64)

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        # The last column of the raw file is the target value.
        feature_names=feature_names[:-1],
        DESCR=descr_text,
        filename=data_file_name,
        data_module=DATA_MODULE,
    )
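
Note for readers: the ``Bunch`` documented in the Returns section above is a
dict subclass with attribute access. A quick illustration using
``load_diabetes``, a loader with the same interface that is not deprecated::

    from sklearn.datasets import load_diabetes

    bunch = load_diabetes()
    print(bunch.data.shape, bunch.target.shape)  # (442, 10) (442,)
    print(bunch["feature_names"][:3])  # dict-style access also works

    # return_X_y=True bypasses the Bunch and returns the arrays directly.
    X, y = load_diabetes(return_X_y=True)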


def load_sample_images():
    """Load sample images for image manipulation.
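
A note on the replacement snippet that appears in both the deprecation
message and the docstring above: the slicing is cryptic because the original
CMU file stores each record across two physical lines, with the first 11
feature values on one line and the remaining 2 features plus the MEDV target
on the next. A commented sketch of the same fetch (same URL and parameters
as in the message; requires network access)::

    import numpy as np
    import pandas as pd

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    # Skip the 22 description lines at the top of the raw file.
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

    # Records are interleaved: even rows carry the first 11 features, odd
    # rows carry the last 2 features followed by the MEDV target.
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

    assert data.shape == (506, 13) and target.shape == (506,)
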
40 changes: 9 additions & 31 deletions sklearn/datasets/tests/test_base.py
@@ -19,15 +19,13 @@
from sklearn.datasets import load_linnerud
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_boston
from sklearn.datasets import load_wine
from sklearn.datasets._base import (
    load_csv_data,
    load_gzip_compressed_csv_data,
)
from sklearn.preprocessing import scale
from sklearn.utils import Bunch
from sklearn.utils._testing import SkipTest
from sklearn.datasets.tests.test_common import check_as_frame


@@ -245,7 +243,6 @@ def test_load_diabetes_raw():
    )


@pytest.mark.filterwarnings("ignore:Function load_boston is deprecated")
@pytest.mark.parametrize(
    "loader_func, data_shape, target_shape, n_target, has_descr, filenames",
    [
@@ -263,7 +260,6 @@ def test_load_diabetes_raw():
        (load_diabetes, (442, 10), (442,), None, True, []),
        (load_digits, (1797, 64), (1797,), 10, True, []),
        (partial(load_digits, n_class=9), (1617, 64), (1617,), 10, True, []),
        (load_boston, (506, 13), (506,), None, True, ["filename"]),
    ],
)
def test_loader(loader_func, data_shape, target_shape, n_target, has_descr, filenames):
@@ -343,31 +339,13 @@ def test_bunch_dir():
    assert "data" in dir(data)


# FIXME: to be removed in 1.2
def test_load_boston_warning():
    """Check that we raise the ethical warning when loading `load_boston`."""
    warn_msg = "The Boston housing prices dataset has an ethical problem"
    with pytest.warns(FutureWarning, match=warn_msg):
        load_boston()


def test_load_boston_error():
    """Check that we raise an informative ImportError when importing `load_boston`."""
    msg = "The Boston housing prices dataset has an ethical problem"
    with pytest.raises(ImportError, match=msg):
        from sklearn.datasets import load_boston  # noqa


@pytest.mark.filterwarnings("ignore:Function load_boston is deprecated")
def test_load_boston_alternative():
    pd = pytest.importorskip("pandas")
    if os.environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "1":
        raise SkipTest(
            "This test requires an internet connection to fetch the dataset."
        )

    boston_sklearn = load_boston()

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    try:
        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    except ConnectionError as e:
        pytest.xfail(f"The dataset can't be downloaded. Got exception: {e}")
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

    np.testing.assert_allclose(data, boston_sklearn.data)
    np.testing.assert_allclose(target, boston_sklearn.target)

    # Other non-existing names should raise the usual ImportError; the bare
    # AttributeError from ``__getattr__`` is converted by the import
    # machinery. (This block continues ``test_load_boston_error`` above.)
    msg = "cannot import name 'non_existing_function' from 'sklearn.datasets'"
    with pytest.raises(ImportError, match=msg):
        from sklearn.datasets import non_existing_function  # noqa
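
To verify the new behavior by hand, a rough check of both error paths
(assumes scikit-learn >= 1.2 is installed; `not_a_real_loader` is an
arbitrary missing name, and the exact message wording may differ between
versions)::

    try:
        from sklearn.datasets import load_boston  # noqa: F401
    except ImportError as exc:
        # Informative message raised directly by the module __getattr__.
        assert "ethical problem" in str(exc)

    try:
        from sklearn.datasets import not_a_real_loader  # noqa: F401
    except ImportError as exc:
        # Bare AttributeError from __getattr__, converted by the import
        # machinery into the standard "cannot import name" error.
        assert "cannot import name" in str(exc)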