diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py
index 42f7b2f12ac0e..7731fe84b8421 100644
--- a/sklearn/datasets/__init__.py
+++ b/sklearn/datasets/__init__.py
@@ -3,8 +3,9 @@
 including methods to load and fetch popular reference datasets. It also
 features some artificial data generators.
 """
+import textwrap
+
 from ._base import load_breast_cancer
-from ._base import load_boston
 from ._base import load_diabetes
 from ._base import load_digits
 from ._base import load_files
@@ -66,7 +67,6 @@
     "fetch_kddcup99",
     "fetch_openml",
     "get_data_home",
-    "load_boston",
     "load_diabetes",
     "load_digits",
     "load_files",
@@ -99,3 +99,63 @@
     "make_spd_matrix",
     "make_swiss_roll",
 ]
+
+
+def __getattr__(name):
+    if name == "load_boston":
+        msg = textwrap.dedent(
+            """
+            `load_boston` has been removed from scikit-learn since version 1.2.
+
+            The Boston housing prices dataset has an ethical problem: as
+            investigated in [1], the authors of this dataset engineered a
+            non-invertible variable "B" assuming that racial self-segregation had a
+            positive impact on house prices [2]. Furthermore the goal of the
+            research that led to the creation of this dataset was to study the
+            impact of air quality but it did not give adequate demonstration of the
+            validity of this assumption.
+
+            The scikit-learn maintainers therefore strongly discourage the use of
+            this dataset unless the purpose of the code is to study and educate
+            about ethical issues in data science and machine learning.
+
+            In this special case, you can fetch the dataset from the original
+            source::
+
+                import pandas as pd
+                import numpy as np
+
+                data_url = "http://lib.stat.cmu.edu/datasets/boston"
+                raw_df = pd.read_csv(data_url, sep="\\s+", skiprows=22, header=None)
+                data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+                target = raw_df.values[1::2, 2]
+
+            Alternative datasets include the California housing dataset and the
+            Ames housing dataset. You can load the datasets as follows::
+
+                from sklearn.datasets import fetch_california_housing
+                housing = fetch_california_housing()
+
+            for the California housing dataset and::
+
+                from sklearn.datasets import fetch_openml
+                housing = fetch_openml(name="house_prices", as_frame=True)
+
+            for the Ames housing dataset.
+
+            [1] M Carlisle.
+            "Racist data destruction?"
+            <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>
+
+            [2] Harrison Jr, David, and Daniel L. Rubinfeld.
+            "Hedonic housing prices and the demand for clean air."
+            Journal of environmental economics and management 5.1 (1978): 81-102.
+            <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>
+            """
+        )
+        raise ImportError(msg)
+    try:
+        return globals()[name]
+    except KeyError:
+        # This is turned into the appropriate ImportError
+        raise AttributeError
diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py
index a9a5f3b39c3ec..baecc81cd3f9b 100644
--- a/sklearn/datasets/_base.py
+++ b/sklearn/datasets/_base.py
@@ -21,7 +21,6 @@
 from ..utils import Bunch
 from ..utils import check_random_state
 from ..utils import check_pandas_support
-from ..utils.deprecation import deprecated
 
 import numpy as np
 
@@ -1170,193 +1169,6 @@ def load_linnerud(*, return_X_y=False, as_frame=False):
     )
 
 
-@deprecated(
-    r"""`load_boston` is deprecated in 1.0 and will be removed in 1.2.
-
-    The Boston housing prices dataset has an ethical problem. You can refer to
-    the documentation of this function for further details.
-
-    The scikit-learn maintainers therefore strongly discourage the use of this
-    dataset unless the purpose of the code is to study and educate about
-    ethical issues in data science and machine learning.
-
-    In this special case, you can fetch the dataset from the original
-    source::
-
-        import pandas as pd
-        import numpy as np
-
-        data_url = "http://lib.stat.cmu.edu/datasets/boston"
-        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
-        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
-        target = raw_df.values[1::2, 2]
-
-    Alternative datasets include the California housing dataset (i.e.
-    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
-    dataset. You can load the datasets as follows::
-
-        from sklearn.datasets import fetch_california_housing
-        housing = fetch_california_housing()
-
-    for the California housing dataset and::
-
-        from sklearn.datasets import fetch_openml
-        housing = fetch_openml(name="house_prices", as_frame=True)
-
-    for the Ames housing dataset."""
-)
-def load_boston(*, return_X_y=False):
-    r"""Load and return the Boston house-prices dataset (regression).
-
-    ==============   ==============
-    Samples total               506
-    Dimensionality               13
-    Features         real, positive
-    Targets           real 5. - 50.
-    ==============   ==============
-
-    Read more in the :ref:`User Guide <boston_dataset>`.
-
-    .. warning::
-        The Boston housing prices dataset has an ethical problem: as
-        investigated in [1]_, the authors of this dataset engineered a
-        non-invertible variable "B" assuming that racial self-segregation had a
-        positive impact on house prices [2]_. Furthermore the goal of the
-        research that led to the creation of this dataset was to study the
-        impact of air quality but it did not give adequate demonstration of the
-        validity of this assumption.
-
-        The scikit-learn maintainers therefore strongly discourage the use of
-        this dataset unless the purpose of the code is to study and educate
-        about ethical issues in data science and machine learning.
-
-        In this special case, you can fetch the dataset from the original
-        source::
-
-            import pandas as pd  # doctest: +SKIP
-            import numpy as np
-
-            data_url = "http://lib.stat.cmu.edu/datasets/boston"
-            raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
-            data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
-            target = raw_df.values[1::2, 2]
-
-        Alternative datasets include the California housing dataset [3]_
-        (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and Ames
-        housing dataset [4]_. You can load the datasets as follows::
-
-            from sklearn.datasets import fetch_california_housing
-            housing = fetch_california_housing()
-
-        for the California housing dataset and::
-
-            from sklearn.datasets import fetch_openml
-            housing = fetch_openml(name="house_prices", as_frame=True)
-
-        for the Ames housing dataset.
-
-    Parameters
-    ----------
-    return_X_y : bool, default=False
-        If True, returns ``(data, target)`` instead of a Bunch object.
-        See below for more information about the `data` and `target` object.
-
-        .. versionadded:: 0.18
-
-    Returns
-    -------
-    data : :class:`~sklearn.utils.Bunch`
-        Dictionary-like object, with the following attributes.
-
-        data : ndarray of shape (506, 13)
-            The data matrix.
-        target : ndarray of shape (506,)
-            The regression target.
-        filename : str
-            The physical location of boston csv dataset.
-
-            .. versionadded:: 0.20
-
-        DESCR : str
-            The full description of the dataset.
-        feature_names : ndarray
-            The names of features
-
-    (data, target) : tuple if ``return_X_y`` is True
-        A tuple of two ndarrays. The first contains a 2D array of shape (506, 13)
-        with each row representing one sample and each column representing the features.
-        The second array of shape (506,) contains the target samples.
-
-        .. versionadded:: 0.18
-
-    Notes
-    -----
-    .. versionchanged:: 0.20
-        Fixed a wrong data point at [445, 0].
-
-    References
-    ----------
-    .. [1] `Racist data destruction? M Carlisle,
-       <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>`_
-    .. [2] `Harrison Jr, David, and Daniel L. Rubinfeld.
-       "Hedonic housing prices and the demand for clean air."
-       Journal of environmental economics and management 5.1 (1978): 81-102.
-       <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>`_
-    .. [3] `California housing dataset
-       <https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset>`_
-    .. [4] `Ames housing dataset
-       <https://www.openml.org/d/42165>`_
-
-    Examples
-    --------
-    >>> import warnings
-    >>> from sklearn.datasets import load_boston
-    >>> with warnings.catch_warnings():
-    ...     # You should probably not use this dataset.
-    ...     warnings.filterwarnings("ignore")
-    ...     X, y = load_boston(return_X_y=True)
-    >>> print(X.shape)
-    (506, 13)
-    """
-    # TODO: once the deprecation period is over, implement a module level
-    # `__getattr__` function in`sklearn.datasets` to raise an exception with
-    # an informative error message at import time instead of just removing
-    # load_boston. The goal is to avoid having beginners that copy-paste code
-    # from numerous books and tutorials that use this dataset loader get
-    # a confusing ImportError when trying to learn scikit-learn.
-    # See: https://www.python.org/dev/peps/pep-0562/
-
-    descr_text = load_descr("boston_house_prices.rst")
-
-    data_file_name = "boston_house_prices.csv"
-    with resources.open_text(DATA_MODULE, data_file_name) as f:
-        data_file = csv.reader(f)
-        temp = next(data_file)
-        n_samples = int(temp[0])
-        n_features = int(temp[1])
-        data = np.empty((n_samples, n_features))
-        target = np.empty((n_samples,))
-        temp = next(data_file)  # names of features
-        feature_names = np.array(temp)
-
-        for i, d in enumerate(data_file):
-            data[i] = np.asarray(d[:-1], dtype=np.float64)
-            target[i] = np.asarray(d[-1], dtype=np.float64)
-
-    if return_X_y:
-        return data, target
-
-    return Bunch(
-        data=data,
-        target=target,
-        # last column is target value
-        feature_names=feature_names[:-1],
-        DESCR=descr_text,
-        filename=data_file_name,
-        data_module=DATA_MODULE,
-    )
-
-
 def load_sample_images():
     """Load sample images for image manipulation.
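
For readers following the removal: below is a self-contained version of the fallback snippet that the new error message points users to. It is only a sketch; it assumes network access to lib.stat.cmu.edu and an environment with pandas and numpy installed, and the shape checks simply mirror the (506, 13) / (506,) sizes documented for the removed loader::

    import numpy as np
    import pandas as pd

    # The raw CMU file stores each record across two physical lines, so the
    # 13 features and the target are reassembled by interleaving the rows.
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

    assert data.shape == (506, 13)
    assert target.shape == (506,)
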
diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
index 2eeb2fc570094..4675a96df56c0 100644
--- a/sklearn/datasets/tests/test_base.py
+++ b/sklearn/datasets/tests/test_base.py
@@ -19,7 +19,6 @@
 from sklearn.datasets import load_linnerud
 from sklearn.datasets import load_iris
 from sklearn.datasets import load_breast_cancer
-from sklearn.datasets import load_boston
 from sklearn.datasets import load_wine
 from sklearn.datasets._base import (
     load_csv_data,
@@ -27,7 +26,6 @@
 )
 from sklearn.preprocessing import scale
 from sklearn.utils import Bunch
-from sklearn.utils._testing import SkipTest
 
 from sklearn.datasets.tests.test_common import check_as_frame
 
@@ -245,7 +243,6 @@ def test_load_diabetes_raw():
     )
 
 
-@pytest.mark.filterwarnings("ignore:Function load_boston is deprecated")
 @pytest.mark.parametrize(
     "loader_func, data_shape, target_shape, n_target, has_descr, filenames",
     [
@@ -263,7 +260,6 @@ def test_load_diabetes_raw():
         (load_diabetes, (442, 10), (442,), None, True, []),
         (load_digits, (1797, 64), (1797,), 10, True, []),
         (partial(load_digits, n_class=9), (1617, 64), (1617,), 10, True, []),
-        (load_boston, (506, 13), (506,), None, True, ["filename"]),
     ],
 )
 def test_loader(loader_func, data_shape, target_shape, n_target, has_descr, filenames):
@@ -343,31 +339,13 @@ def test_bunch_dir():
     assert "data" in dir(data)
 
 
-# FIXME: to be removed in 1.2
-def test_load_boston_warning():
-    """Check that we raise the ethical warning when loading `load_boston`."""
-    warn_msg = "The Boston housing prices dataset has an ethical problem"
-    with pytest.warns(FutureWarning, match=warn_msg):
-        load_boston()
+def test_load_boston_error():
+    """Check that we raise the ethical warning when trying to import `load_boston`."""
+    msg = "The Boston housing prices dataset has an ethical problem"
+    with pytest.raises(ImportError, match=msg):
+        from sklearn.datasets import load_boston  # noqa
 
-
-@pytest.mark.filterwarnings("ignore:Function load_boston is deprecated")
-def test_load_boston_alternative():
-    pd = pytest.importorskip("pandas")
-    if os.environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "1":
-        raise SkipTest(
-            "This test requires an internet connection to fetch the dataset."
-        )
-
-    boston_sklearn = load_boston()
-
-    data_url = "http://lib.stat.cmu.edu/datasets/boston"
-    try:
-        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
-    except ConnectionError as e:
-        pytest.xfail(f"The dataset can't be downloaded. Got exception: {e}")
-    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
-    target = raw_df.values[1::2, 2]
-
-    np.testing.assert_allclose(data, boston_sklearn.data)
-    np.testing.assert_allclose(target, boston_sklearn.target)
+    # other non-existing function should raise the usual import error
+    msg = "cannot import name 'non_existing_function' from 'sklearn.datasets'"
+    with pytest.raises(ImportError, match=msg):
+        from sklearn.datasets import non_existing_function  # noqa
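
The new test depends on two behaviours of a module-level `__getattr__` (PEP 562, linked in the removed TODO comment): an `ImportError` raised inside `__getattr__` propagates unchanged to a `from ... import ...` statement, while a bare `AttributeError` is converted by the import machinery into the usual "cannot import name" `ImportError`. A minimal sketch of that mechanism, using a hypothetical toy module rather than `sklearn.datasets` itself::

    import sys
    import types

    # Hypothetical stand-in for sklearn.datasets with a PEP 562 __getattr__.
    toy = types.ModuleType("toy_datasets")

    def _module_getattr(name):
        if name == "load_boston":
            # Mirrors the diff: removed loader -> informative ImportError.
            raise ImportError("`load_boston` has been removed; see the docs.")
        # Unknown names fall back to AttributeError, which Python turns into
        # the standard "cannot import name ..." ImportError for from-imports.
        raise AttributeError(name)

    toy.__getattr__ = _module_getattr
    sys.modules["toy_datasets"] = toy

    try:
        from toy_datasets import load_boston  # noqa: F401
    except ImportError as exc:
        print(exc)  # the informative removal message

    try:
        from toy_datasets import not_there  # noqa: F401
    except ImportError as exc:
        print(exc)  # cannot import name 'not_there' from 'toy_datasets'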