MAINT Clean deprecation for 1.2: load_boston #24603

Merged

64 changes: 62 additions & 2 deletions sklearn/datasets/__init__.py
@@ -3,8 +3,9 @@
including methods to load and fetch popular reference datasets. It also
features some artificial data generators.
"""
import textwrap

from ._base import load_breast_cancer
from ._base import load_boston
from ._base import load_diabetes
from ._base import load_digits
from ._base import load_files
@@ -66,7 +67,6 @@
    "fetch_kddcup99",
    "fetch_openml",
    "get_data_home",
    "load_boston",
    "load_diabetes",
    "load_digits",
    "load_files",
@@ -99,3 +99,63 @@
    "make_spd_matrix",
    "make_swiss_roll",
]


def __getattr__(name):
    if name == "load_boston":
        msg = textwrap.dedent(
            """
            `load_boston` was removed from scikit-learn in version 1.2.

            The Boston housing prices dataset has an ethical problem: as
            investigated in [1], the authors of this dataset engineered a
            non-invertible variable "B" assuming that racial self-segregation
            had a positive impact on house prices [2]. Furthermore, the goal
            of the research that led to the creation of this dataset was to
            study the impact of air quality, but it did not adequately
            demonstrate the validity of this assumption.

            The scikit-learn maintainers therefore strongly discourage the use
            of this dataset unless the purpose of the code is to study and
            educate about ethical issues in data science and machine learning.

            In this special case, you can fetch the dataset from the original
            source::

                import pandas as pd
                import numpy as np

                data_url = "http://lib.stat.cmu.edu/datasets/boston"
                raw_df = pd.read_csv(data_url, sep="\\s+", skiprows=22, header=None)
                data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
                target = raw_df.values[1::2, 2]

            Alternative datasets include the California housing dataset and the
            Ames housing dataset. You can load the datasets as follows::

                from sklearn.datasets import fetch_california_housing
                housing = fetch_california_housing()

            for the California housing dataset and::

                from sklearn.datasets import fetch_openml
                housing = fetch_openml(name="house_prices", as_frame=True)

            for the Ames housing dataset.

            [1] M Carlisle.
            "Racist data destruction?"
            <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

            [2] Harrison Jr, David, and Daniel L. Rubinfeld.
            "Hedonic housing prices and the demand for clean air."
            Journal of environmental economics and management 5.1 (1978): 81-102.
            <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>
            """
        )
        raise ImportError(msg)
    try:
        return globals()[name]
    except KeyError:
        # The import machinery turns this AttributeError into the standard
        # "cannot import name ..." ImportError.
        raise AttributeError
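
Aside: the module-level `__getattr__` above relies on the PEP 562 hook
(Python 3.7+), which Python calls only when normal attribute lookup on the
module fails, so ordinary imports are unaffected. A minimal sketch of the
mechanism, using a hypothetical package `mypkg` (not scikit-learn code)::

    # mypkg/__init__.py -- hypothetical PEP 562 example
    def __getattr__(name):
        # Called only when `mypkg.<name>` is not found by normal lookup.
        if name == "removed_function":
            raise ImportError("`removed_function` was removed in 2.0.")
        # For any other missing name, `from mypkg import <name>` converts
        # this AttributeError into the usual "cannot import name" ImportError.
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

With this hook, `from mypkg import removed_function` fails with the custom
message, while any other missing name still produces the standard error;
that is exactly the split exercised by the tests further down.
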
188 changes: 0 additions & 188 deletions sklearn/datasets/_base.py
@@ -21,7 +21,6 @@
from ..utils import Bunch
from ..utils import check_random_state
from ..utils import check_pandas_support
from ..utils.deprecation import deprecated

import numpy as np

@@ -1170,193 +1169,6 @@ def load_linnerud(*, return_X_y=False, as_frame=False):
    )


@deprecated(
    r"""`load_boston` is deprecated in 1.0 and will be removed in 1.2.

    The Boston housing prices dataset has an ethical problem. You can refer
    to the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of
    this dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_housing
        housing = fetch_california_housing()

    for the California housing dataset and::

        from sklearn.datasets import fetch_openml
        housing = fetch_openml(name="house_prices", as_frame=True)

    for the Ames housing dataset."""
)
def load_boston(*, return_X_y=False):
    r"""Load and return the Boston house-prices dataset (regression).

    ==============   ==============
    Samples total    506
    Dimensionality   13
    Features         real, positive
    Targets          real 5. - 50.
    ==============   ==============

    Read more in the :ref:`User Guide <boston_dataset>`.

    .. warning::
        The Boston housing prices dataset has an ethical problem: as
        investigated in [1]_, the authors of this dataset engineered a
        non-invertible variable "B" assuming that racial self-segregation
        had a positive impact on house prices [2]_. Furthermore, the goal of
        the research that led to the creation of this dataset was to study
        the impact of air quality, but it did not adequately demonstrate the
        validity of this assumption.

        The scikit-learn maintainers therefore strongly discourage the use of
        this dataset unless the purpose of the code is to study and educate
        about ethical issues in data science and machine learning.

        In this special case, you can fetch the dataset from the original
        source::

            import pandas as pd
            import numpy as np

            data_url = "http://lib.stat.cmu.edu/datasets/boston"
            raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
            data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
            target = raw_df.values[1::2, 2]

        Alternative datasets include the California housing dataset [3]_
        (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and the Ames
        housing dataset [4]_. You can load the datasets as follows::

            from sklearn.datasets import fetch_california_housing
            housing = fetch_california_housing()

        for the California housing dataset and::

            from sklearn.datasets import fetch_openml
            housing = fetch_openml(name="house_prices", as_frame=True)

        for the Ames housing dataset.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object. See
        below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (506, 13)
            The data matrix.
        target : ndarray of shape (506,)
            The regression target.
        filename : str
            The physical location of the Boston CSV dataset.

            .. versionadded:: 0.20

        DESCR : str
            The full description of the dataset.
        feature_names : ndarray
            The names of the features.

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarrays. The first contains a 2D array of shape
        (506, 13) with each row representing one sample and each column
        representing a feature. The second array of shape (506,) contains
        the target samples.

        .. versionadded:: 0.18

    Notes
    -----
    .. versionchanged:: 0.20
        Fixed a wrong data point at [445, 0].

    References
    ----------
    .. [1] `Racist data destruction? M Carlisle,
           <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>`_
    .. [2] `Harrison Jr, David, and Daniel L. Rubinfeld.
           "Hedonic housing prices and the demand for clean air."
           Journal of environmental economics and management 5.1 (1978): 81-102.
           <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>`_
    .. [3] `California housing dataset
           <https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset>`_
    .. [4] `Ames housing dataset
           <https://www.openml.org/d/42165>`_

    Examples
    --------
    >>> import warnings
    >>> from sklearn.datasets import load_boston
    >>> with warnings.catch_warnings():
    ...     # You should probably not use this dataset.
    ...     warnings.filterwarnings("ignore")
    ...     X, y = load_boston(return_X_y=True)
    >>> print(X.shape)
    (506, 13)
    """
    # TODO: once the deprecation period is over, implement a module level
    # `__getattr__` function in `sklearn.datasets` to raise an exception with
    # an informative error message at import time instead of just removing
    # load_boston. The goal is to avoid having beginners that copy-paste code
    # from numerous books and tutorials that use this dataset loader get a
    # confusing ImportError when trying to learn scikit-learn.
    # See: https://www.python.org/dev/peps/pep-0562/

    descr_text = load_descr("boston_house_prices.rst")

    data_file_name = "boston_house_prices.csv"
    with resources.open_text(DATA_MODULE, data_file_name) as f:
        data_file = csv.reader(f)
        # The CSV header stores the number of samples and features.
        temp = next(data_file)
        n_samples = int(temp[0])
        n_features = int(temp[1])
        data = np.empty((n_samples, n_features))
        target = np.empty((n_samples,))
        temp = next(data_file)  # names of features
        feature_names = np.array(temp)

        for i, d in enumerate(data_file):
            data[i] = np.asarray(d[:-1], dtype=np.float64)
            target[i] = np.asarray(d[-1], dtype=np.float64)

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        # The last column of the raw file is the target value.
        feature_names=feature_names[:-1],
        DESCR=descr_text,
        filename=data_file_name,
        data_module=DATA_MODULE,
    )
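
Note for readers: the ``Bunch`` documented in the Returns section above is a
dict subclass with attribute access. A quick illustration using
``load_diabetes``, a loader with the same interface that is not deprecated::

    from sklearn.datasets import load_diabetes

    bunch = load_diabetes()
    print(bunch.data.shape, bunch.target.shape)  # (442, 10) (442,)
    print(bunch["feature_names"][:3])  # dict-style access also works

    # return_X_y=True bypasses the Bunch and returns the arrays directly.
    X, y = load_diabetes(return_X_y=True)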


def load_sample_images():
    """Load sample images for image manipulation.
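
A note on the replacement snippet that appears in both the deprecation
message and the docstring above: the slicing is cryptic because the original
CMU file stores each record across two physical lines, with the first 11
feature values on one line and the remaining 2 features plus the MEDV target
on the next. A commented sketch of the same fetch (same URL and parameters
as in the message; requires network access)::

    import numpy as np
    import pandas as pd

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    # Skip the 22 description lines at the top of the raw file.
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

    # Records are interleaved: even rows carry the first 11 features, odd
    # rows carry the last 2 features followed by the MEDV target.
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

    assert data.shape == (506, 13) and target.shape == (506,)
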
40 changes: 9 additions & 31 deletions sklearn/datasets/tests/test_base.py
@@ -19,15 +19,13 @@
from sklearn.datasets import load_linnerud
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_boston
from sklearn.datasets import load_wine
from sklearn.datasets._base import (
    load_csv_data,
    load_gzip_compressed_csv_data,
)
from sklearn.preprocessing import scale
from sklearn.utils import Bunch
from sklearn.utils._testing import SkipTest
from sklearn.datasets.tests.test_common import check_as_frame


@@ -245,7 +243,6 @@ def test_load_diabetes_raw():
    )


@pytest.mark.filterwarnings("ignore:Function load_boston is deprecated")
@pytest.mark.parametrize(
    "loader_func, data_shape, target_shape, n_target, has_descr, filenames",
    [
@@ -263,7 +260,6 @@ def test_load_diabetes_raw():
        (load_diabetes, (442, 10), (442,), None, True, []),
        (load_digits, (1797, 64), (1797,), 10, True, []),
        (partial(load_digits, n_class=9), (1617, 64), (1617,), 10, True, []),
        (load_boston, (506, 13), (506,), None, True, ["filename"]),
    ],
)
def test_loader(loader_func, data_shape, target_shape, n_target, has_descr, filenames):
@@ -343,31 +339,13 @@ def test_bunch_dir():
    assert "data" in dir(data)


# FIXME: to be removed in 1.2
def test_load_boston_warning():
    """Check that we raise the ethical warning when loading `load_boston`."""
    warn_msg = "The Boston housing prices dataset has an ethical problem"
    with pytest.warns(FutureWarning, match=warn_msg):
        load_boston()


def test_load_boston_error():
    """Check that we raise an informative ImportError when importing `load_boston`."""
    msg = "The Boston housing prices dataset has an ethical problem"
    with pytest.raises(ImportError, match=msg):
        from sklearn.datasets import load_boston  # noqa


@pytest.mark.filterwarnings("ignore:Function load_boston is deprecated")
def test_load_boston_alternative():
    pd = pytest.importorskip("pandas")
    if os.environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "1":
        raise SkipTest(
            "This test requires an internet connection to fetch the dataset."
        )

    boston_sklearn = load_boston()

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    try:
        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    except ConnectionError as e:
        pytest.xfail(f"The dataset can't be downloaded. Got exception: {e}")
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

    np.testing.assert_allclose(data, boston_sklearn.data)
    np.testing.assert_allclose(target, boston_sklearn.target)

    # Other non-existing names should raise the usual ImportError; the bare
    # AttributeError from ``__getattr__`` is converted by the import
    # machinery. (This block continues ``test_load_boston_error`` above.)
    msg = "cannot import name 'non_existing_function' from 'sklearn.datasets'"
    with pytest.raises(ImportError, match=msg):
        from sklearn.datasets import non_existing_function  # noqa
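
To verify the new behavior by hand, a rough check of both error paths
(assumes scikit-learn >= 1.2 is installed; `not_a_real_loader` is an
arbitrary missing name, and the exact message wording may differ between
versions)::

    try:
        from sklearn.datasets import load_boston  # noqa: F401
    except ImportError as exc:
        # Informative message raised directly by the module __getattr__.
        assert "ethical problem" in str(exc)

    try:
        from sklearn.datasets import not_a_real_loader  # noqa: F401
    except ImportError as exc:
        # Bare AttributeError from __getattr__, converted by the import
        # machinery into the standard "cannot import name" error.
        assert "cannot import name" in str(exc)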