From 903887cf2e84d38456433a11f34b21b09908e878 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Wed, 20 Jan 2016 01:21:20 +0530 Subject: [PATCH 01/11] Add sample_weight to median_absolute_error --- sklearn/metrics/regression.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index af3a02d6f33f9..d5d08b251bc5b 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -26,6 +26,7 @@ from ..utils.validation import check_array, check_consistent_length from ..utils.validation import column_or_1d +from ..utils.stats import _weighted_percentile from ..externals.six import string_types import warnings @@ -241,7 +242,7 @@ def mean_squared_error(y_true, y_pred, return np.average(output_errors, weights=multioutput) -def median_absolute_error(y_true, y_pred): +def median_absolute_error(y_true, y_pred, sample_weight=None): """Median absolute error regression loss Read more in the :ref:`User Guide <median_absolute_error>`. @@ -254,6 +255,9 @@ def median_absolute_error(y_true, y_pred): y_pred : array-like of shape = (n_samples) Estimated target values. + sample_weight : array-like of shape = (n_samples), optional + Sample weights. 
+ Returns ------- loss : float @@ -272,7 +276,17 @@ def median_absolute_error(y_true, y_pred): 'uniform_average') if y_type == 'continuous-multioutput': raise ValueError("Multioutput not supported in median_absolute_error") - return np.median(np.abs(y_pred - y_true)) + + if sample_weight is None: + return np.median(np.abs(y_pred - y_true)) + else: + check_consistent_length(y_pred, sample_weight) + sample_weight = np.array(sample_weight) + y_pred = y_pred.ravel() + y_true = y_true.ravel() + print sample_weight.shape + return _weighted_percentile(np.abs(y_pred - y_true), + np.asarray(sample_weight)) def explained_variance_score(y_true, y_pred, From ce21b98adbcb3678fe345bcb12aaf1a1330fd500 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Sat, 23 Jan 2016 22:12:52 +0530 Subject: [PATCH 02/11] ENH: Add sample_weight to median_absolute_error Also make _weighted_percentile in utils more robust --- sklearn/metrics/regression.py | 1 - sklearn/utils/stats.py | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index d5d08b251bc5b..eae71dca8b588 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -284,7 +284,6 @@ def median_absolute_error(y_true, y_pred, sample_weight=None): sample_weight = np.array(sample_weight) y_pred = y_pred.ravel() y_true = y_true.ravel() - print sample_weight.shape return _weighted_percentile(np.abs(y_pred - y_true), np.asarray(sample_weight)) diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py index 463146d038c6b..aca5072f79d40 100644 --- a/sklearn/utils/stats.py +++ b/sklearn/utils/stats.py @@ -51,9 +51,12 @@ def _rankdata(a, method="average"): def _weighted_percentile(array, sample_weight, percentile=50): """Compute the weighted ``percentile`` of ``array`` with ``sample_weight``. 
""" sorted_idx = np.argsort(array) - + sample_weight = np.array(sample_weight) + midpoint = sum(sample_weight)/2.0 # Find index of median prediction for each sample weight_cdf = sample_weight[sorted_idx].cumsum() percentile_idx = np.searchsorted( weight_cdf, (percentile / 100.) * weight_cdf[-1]) + if weight_cdf[percentile_idx] == midpoint: + return np.mean(array[sorted_idx[percentile_idx]:sorted_idx[percentile_idx+1]+1]) return array[sorted_idx[percentile_idx]] From 35b5f066149d463340b8486f3f91ca23c9e629b9 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Sun, 24 Jan 2016 23:50:57 +0530 Subject: [PATCH 03/11] Revert changes to _weighted_percentile in the utils Use linear interpolation to calculate weighted median --- sklearn/metrics/regression.py | 11 ++++++++++- sklearn/utils/stats.py | 4 ---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index eae71dca8b588..704c5966d7e44 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -26,7 +26,6 @@ from ..utils.validation import check_array, check_consistent_length from ..utils.validation import column_or_1d -from ..utils.stats import _weighted_percentile from ..externals.six import string_types import warnings @@ -242,6 +241,16 @@ def mean_squared_error(y_true, y_pred, return np.average(output_errors, weights=multioutput) +def _weighted_percentile(array, sample_weight, percentile=50): + sorted_idx = np.argsort(array) + sample_weight = np.array(sample_weight) + weight_cdf = sample_weight[sorted_idx].cumsum() + weighted_percentile = (weight_cdf - sample_weight/2.0) / weight_cdf[-1] + sorted_array = np.sort(array) + weighted_median = np.interp(percentile/100., weighted_percentile, sorted_array) + return weighted_median + + def median_absolute_error(y_true, y_pred, sample_weight=None): """Median absolute error regression loss diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py index 
aca5072f79d40..7d14ac7933a2b 100644 --- a/sklearn/utils/stats.py +++ b/sklearn/utils/stats.py @@ -51,12 +51,8 @@ def _rankdata(a, method="average"): def _weighted_percentile(array, sample_weight, percentile=50): """Compute the weighted ``percentile`` of ``array`` with ``sample_weight``. """ sorted_idx = np.argsort(array) - sample_weight = np.array(sample_weight) - midpoint = sum(sample_weight)/2.0 # Find index of median prediction for each sample weight_cdf = sample_weight[sorted_idx].cumsum() percentile_idx = np.searchsorted( weight_cdf, (percentile / 100.) * weight_cdf[-1]) - if weight_cdf[percentile_idx] == midpoint: - return np.mean(array[sorted_idx[percentile_idx]:sorted_idx[percentile_idx+1]+1]) return array[sorted_idx[percentile_idx]] From d0084764b90484854d7549b8d172979bd5d58ebb Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Tue, 2 Feb 2016 19:55:37 +0530 Subject: [PATCH 04/11] Some more changes and tests --- sklearn/metrics/regression.py | 14 ++------------ sklearn/metrics/tests/test_common.py | 11 ++++++----- sklearn/metrics/tests/test_regression.py | 21 +++++++++++++++++++++ sklearn/utils/extmath.py | 10 ++++++++++ 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 704c5966d7e44..8f8a2962ca3d8 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -26,6 +26,7 @@ from ..utils.validation import check_array, check_consistent_length from ..utils.validation import column_or_1d +from ..utils.extmath import weighted_median from ..externals.six import string_types import warnings @@ -241,16 +242,6 @@ def mean_squared_error(y_true, y_pred, return np.average(output_errors, weights=multioutput) -def _weighted_percentile(array, sample_weight, percentile=50): - sorted_idx = np.argsort(array) - sample_weight = np.array(sample_weight) - weight_cdf = sample_weight[sorted_idx].cumsum() - weighted_percentile = (weight_cdf - sample_weight/2.0) / weight_cdf[-1] 
- sorted_array = np.sort(array) - weighted_median = np.interp(percentile/100., weighted_percentile, sorted_array) - return weighted_median - - def median_absolute_error(y_true, y_pred, sample_weight=None): """Median absolute error regression loss @@ -285,7 +276,6 @@ def median_absolute_error(y_true, y_pred, sample_weight=None): 'uniform_average') if y_type == 'continuous-multioutput': raise ValueError("Multioutput not supported in median_absolute_error") - if sample_weight is None: return np.median(np.abs(y_pred - y_true)) else: @@ -293,7 +283,7 @@ def median_absolute_error(y_true, y_pred, sample_weight=None): sample_weight = np.array(sample_weight) y_pred = y_pred.ravel() y_true = y_true.ravel() - return _weighted_percentile(np.abs(y_pred - y_true), + return weighted_median(np.abs(y_pred - y_true), np.asarray(sample_weight)) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index fa4c7e8d3124b..edcd5218f264b 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -374,7 +374,6 @@ # matrix instead of a number. Testing of # confusion_matrix with sample_weight is in # test_classification.py - "median_absolute_error", ] @@ -956,10 +955,12 @@ def check_sample_weight_invariance(name, metric, y1, y2): # check that the weighted and unweighted scores are unequal weighted_score = metric(y1, y2, sample_weight=sample_weight) - assert_not_equal( - unweighted_score, weighted_score, - msg="Unweighted and weighted scores are unexpectedly " - "equal (%f) for %s" % (weighted_score, name)) + if name != "median_absolute_error": + # unweighted and weighted give same value for this metric. 
see #6217 + assert_not_equal( + unweighted_score, weighted_score, + msg="Unweighted and weighted scores are unexpectedly " + "equal (%f) for %s" % (weighted_score, name)) # check that sample_weight can be a list weighted_score_list = metric(y1, y2, diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 600bcc135a202..2a442f2793642 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -5,6 +5,7 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_not_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal @@ -141,3 +142,23 @@ def test_regression_custom_weights(): assert_almost_equal(maew, 0.475, decimal=3) assert_almost_equal(rw, 0.94, decimal=2) assert_almost_equal(evsw, 0.94, decimal=2) + +def test_median_absolute_error_weights(): + y_tr = [3, -0.5, 2, 7] + y_pr = [2.5, 0.0, 2, 8] + sample_weight = [1, 2, 3, 4] + # check that unit weights gives the same score as no weight + unweighted_score = median_absolute_error(y_tr, y_pr, sample_weight=None) + assert_almost_equal( + unweighted_score, median_absolute_error(y_tr, y_pr, + sample_weight=np.ones(shape=len(y_tr))), + err_msg="For median_absolute_error sample_weight=None is not " + "equivalent to sample_weight=ones" ) + + # check that the weighted and unweighted scores are unequal + weighted_score = median_absolute_error(y_tr, y_pr, + sample_weight=sample_weight) + assert_not_equal( + unweighted_score, weighted_score, + msg="Unweighted and weighted scores are unexpectedly " + "equal (%f) for median_absolute_error" % weighted_score) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 1857a27adfadc..5705e0e1b4803 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -862,3 +862,13 @@ 
def stable_cumsum(arr, rtol=1e-05, atol=1e-08): raise RuntimeError('cumsum was found to be unstable: ' 'its last element does not correspond to sum') return out + + +def weighted_median(array, sample_weight): + sorted_idx = np.argsort(array) + sample_weight = np.asarray(sample_weight) + weight_cdf = sample_weight[sorted_idx].cumsum() + weighted_percentile = (weight_cdf - sample_weight/2.0) / weight_cdf[-1] + sorted_array = array[sorted_idx] + weighted_median = np.interp(0.5, weighted_percentile, sorted_array) + return weighted_median From e0b2f909ca7cbf5cd920414bc6b701ec470c6347 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Tue, 9 Feb 2016 23:42:23 +0530 Subject: [PATCH 05/11] Some more changes to the failing tests --- sklearn/metrics/tests/test_score_objects.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index c3643e81031a5..033791018cbf1 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -384,10 +384,11 @@ def test_scorer_sample_weight(): sample_weight=sample_weight) ignored = scorer(estimator[name], X_test[10:], target[10:]) unweighted = scorer(estimator[name], X_test, target) - assert_not_equal(weighted, unweighted, - msg="scorer {0} behaves identically when " - "called with sample weights: {1} vs " - "{2}".format(name, weighted, unweighted)) + if name != "median_absolute_error": + assert_not_equal(weighted, unweighted, + msg="scorer {0} behaves identically when " + "called with sample weights: {1} vs " + "{2}".format(name, weighted, unweighted)) assert_almost_equal(weighted, ignored, err_msg="scorer {0} behaves differently when " "ignoring samples and setting sample_weight to" From 3c7d509d8ae4c38d94befa899728ab12bfb69d36 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Tue, 15 Mar 2016 20:17:17 +0530 Subject: [PATCH 06/11] Fix a bug in weighted median implementation --- 
sklearn/metrics/regression.py | 11 +++++------ sklearn/metrics/tests/test_regression.py | 2 +- sklearn/utils/extmath.py | 5 +++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 8f8a2962ca3d8..880403623dae7 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -276,15 +276,14 @@ def median_absolute_error(y_true, y_pred, sample_weight=None): 'uniform_average') if y_type == 'continuous-multioutput': raise ValueError("Multioutput not supported in median_absolute_error") + y_pred = y_pred.ravel() + y_true = y_true.ravel() if sample_weight is None: - return np.median(np.abs(y_pred - y_true)) + sample_weight = np.ones_like(y_true) else: check_consistent_length(y_pred, sample_weight) - sample_weight = np.array(sample_weight) - y_pred = y_pred.ravel() - y_true = y_true.ravel() - return weighted_median(np.abs(y_pred - y_true), - np.asarray(sample_weight)) + sample_weight = np.asarray(sample_weight) + return weighted_median(np.abs(y_pred - y_true), sample_weight) def explained_variance_score(y_true, y_pred, diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 2a442f2793642..272cacb2fed95 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -146,7 +146,7 @@ def test_regression_custom_weights(): def test_median_absolute_error_weights(): y_tr = [3, -0.5, 2, 7] y_pr = [2.5, 0.0, 2, 8] - sample_weight = [1, 2, 3, 4] + sample_weight = [2, 3, 1, 4] # check that unit weights gives the same score as no weight unweighted_score = median_absolute_error(y_tr, y_pr, sample_weight=None) assert_almost_equal( diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 5705e0e1b4803..6c9a66ec823cf 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -867,8 +867,9 @@ def stable_cumsum(arr, rtol=1e-05, atol=1e-08): def weighted_median(array, sample_weight): 
sorted_idx = np.argsort(array) sample_weight = np.asarray(sample_weight) - weight_cdf = sample_weight[sorted_idx].cumsum() - weighted_percentile = (weight_cdf - sample_weight/2.0) / weight_cdf[-1] + sorted_sample_weight = sample_weight[sorted_idx] + weight_cdf = sorted_sample_weight.cumsum() + weighted_percentile = (weight_cdf - sorted_sample_weight/2.0) / weight_cdf[-1] sorted_array = array[sorted_idx] weighted_median = np.interp(0.5, weighted_percentile, sorted_array) return weighted_median From c56ecdc6b50b5699b963a10f4ea6da7ac19b4c94 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Tue, 22 Mar 2016 19:00:48 +0530 Subject: [PATCH 07/11] Add documentation to weighted_median --- sklearn/utils/extmath.py | 18 ++++++++++++++++++ sklearn/utils/tests/test_extmath.py | 11 ++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 6c9a66ec823cf..f8757ce0ef9ad 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -865,6 +865,24 @@ def stable_cumsum(arr, rtol=1e-05, atol=1e-08): def weighted_median(array, sample_weight): + """Compute the weighted median of the array with sample weight + + Parameters + ---------- + array : array_like + n-dimensional array of which to find weighted median. 
+ sample_weight : array_like + n-dimensional array of weights for each value + + Example + ------- + >>> from sklearn.utils.extmath import weighted_median + >>> import numpy as np + >>> weighted_median(np.array([1,2,3,4]),np.array([1,1,1,1])) + 2.5 + >>> weighted_median(np.array([1,2,3]),np.array([1,1,1])) + 2.0 + """ sorted_idx = np.argsort(array) sample_weight = np.asarray(sample_weight) sorted_sample_weight = sample_weight[sorted_idx] diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 55f96cdf1574c..93a7e0c60a685 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -27,7 +27,7 @@ from sklearn.utils.extmath import norm, squared_norm from sklearn.utils.extmath import randomized_svd from sklearn.utils.extmath import row_norms -from sklearn.utils.extmath import weighted_mode +from sklearn.utils.extmath import weighted_mode, weighted_median from sklearn.utils.extmath import cartesian from sklearn.utils.extmath import log_logistic from sklearn.utils.extmath import fast_dot, _fast_dot @@ -658,3 +658,12 @@ def test_stable_cumsum(): 'cumsum was found to be unstable: its last element ' 'does not correspond to sum', stable_cumsum, r, rtol=0, atol=0) + + +def test_weighted_median(): + rng = np.random.RandomState(0) + x = rng.randint(10, size=(10,)) + weights = np.ones(x.shape) + median = np.median(x) + wmedian = weighted_median(x, weights) + assert_almost_equal(median, wmedian) From 006d86e5de5c1cc98cb8f76d09d02abbbf387341 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Thu, 24 Mar 2016 16:54:50 +0530 Subject: [PATCH 08/11] Add documentation to weighted_median --- doc/developers/utilities.rst | 3 +++ sklearn/utils/extmath.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/doc/developers/utilities.rst b/doc/developers/utilities.rst index 8dbe460635926..895c4b8d23506 100644 --- a/doc/developers/utilities.rst +++ b/doc/developers/utilities.rst @@ -122,6 +122,9 @@ Efficient 
Linear Algebra & Array Operations - :func:`shuffle`: Shuffle arrays or sparse matrices in a consistent way. Used in ``sklearn.cluster.k_means``. +- :func:`extmath.weighted_median`: an implementation to get weighted median + of the array using sample weights. + Efficient Random Sampling ========================= diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index f8757ce0ef9ad..0b302042d8676 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -874,6 +874,10 @@ def weighted_median(array, sample_weight): sample_weight : array_like n-dimensional array of weights for each value + Returns + ------- + weighted_median : float + Weighted median of the array Example ------- >>> from sklearn.utils.extmath import weighted_median @@ -882,6 +886,11 @@ def weighted_median(array, sample_weight): 2.5 >>> weighted_median(np.array([1,2,3]),np.array([1,1,1])) 2.0 + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Percentile + """ sorted_idx = np.argsort(array) sample_weight = np.asarray(sample_weight) From 4f61f59d1c0de870dbe72c62d5710ff846630003 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Thu, 15 Sep 2016 13:13:21 +0530 Subject: [PATCH 09/11] Add more tests --- sklearn/utils/tests/test_extmath.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 93a7e0c60a685..1ecc1809a8020 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -667,3 +667,18 @@ def test_weighted_median(): median = np.median(x) wmedian = weighted_median(x, weights) assert_almost_equal(median, wmedian) + + +def test_weighted_median_equal_split(): + rng = np.random.RandomState(0) + weights_left = rng.multinomial(20, [1/5.]*5, size=1)[0] + weights_right = rng.multinomial(20, [1/5.]*5, size=1)[0] + x = np.asarray(range(20)) + rng.shuffle(x) + x = x[10:] + x.sort() + weights = np.hstack((weights_left, weights_right)) + 
wmedian = weighted_median(x, weights) + sum_left = np.sum(weights[np.where(x < wmedian)]) + sum_right = np.sum(weights[np.where(x > wmedian)]) + assert_equal(sum_left, sum_right) From e192027370841dd09cca7148c476cfd09bae12f0 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Thu, 15 Sep 2016 13:44:38 +0530 Subject: [PATCH 10/11] fix test --- sklearn/metrics/tests/test_score_objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 033791018cbf1..ce2ccb39b6e55 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -384,7 +384,7 @@ def test_scorer_sample_weight(): sample_weight=sample_weight) ignored = scorer(estimator[name], X_test[10:], target[10:]) unweighted = scorer(estimator[name], X_test, target) - if name != "median_absolute_error": + if "median_absolute_error" not in name: assert_not_equal(weighted, unweighted, msg="scorer {0} behaves identically when " "called with sample weights: {1} vs " From 7a8611f67ffc8c316fa297827eefe71681044770 Mon Sep 17 00:00:00 2001 From: Maniteja Nandana Date: Thu, 15 Sep 2016 14:46:18 +0530 Subject: [PATCH 11/11] solve flake8 checks --- sklearn/metrics/tests/test_common.py | 2 +- sklearn/metrics/tests/test_regression.py | 22 +++++++++++----------- sklearn/utils/extmath.py | 3 ++- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index edcd5218f264b..c7fe05e615f38 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -956,7 +956,7 @@ def check_sample_weight_invariance(name, metric, y1, y2): # check that the weighted and unweighted scores are unequal weighted_score = metric(y1, y2, sample_weight=sample_weight) if name != "median_absolute_error": - # unweighted and weighted give same value for this metric. 
see #6217 + # unweighted and weighted give same value for this metric. see #6217 assert_not_equal( unweighted_score, weighted_score, msg="Unweighted and weighted scores are unexpectedly " diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 272cacb2fed95..6eec266e579cc 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -142,23 +142,23 @@ def test_regression_custom_weights(): assert_almost_equal(maew, 0.475, decimal=3) assert_almost_equal(rw, 0.94, decimal=2) assert_almost_equal(evsw, 0.94, decimal=2) - + + def test_median_absolute_error_weights(): y_tr = [3, -0.5, 2, 7] y_pr = [2.5, 0.0, 2, 8] sample_weight = [2, 3, 1, 4] # check that unit weights gives the same score as no weight unweighted_score = median_absolute_error(y_tr, y_pr, sample_weight=None) - assert_almost_equal( - unweighted_score, median_absolute_error(y_tr, y_pr, - sample_weight=np.ones(shape=len(y_tr))), - err_msg="For median_absolute_error sample_weight=None is not " - "equivalent to sample_weight=ones" ) + weighted_score = median_absolute_error(y_tr, y_pr, + sample_weight=np.ones(len(y_tr))) + assert_almost_equal(unweighted_score, weighted_score, + err_msg="For median_absolute_error sample_weight=None" + "is not equivalent to sample_weight=ones") # check that the weighted and unweighted scores are unequal weighted_score = median_absolute_error(y_tr, y_pr, - sample_weight=sample_weight) - assert_not_equal( - unweighted_score, weighted_score, - msg="Unweighted and weighted scores are unexpectedly " - "equal (%f) for median_absolute_error" % weighted_score) + sample_weight=sample_weight) + assert_not_equal(unweighted_score, weighted_score, + msg="Unweighted and weighted scores are unexpectedly " + "equal (%f) for median_absolute_error" % weighted_score) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 0b302042d8676..d9ed0c669ea44 100644 --- a/sklearn/utils/extmath.py +++ 
b/sklearn/utils/extmath.py @@ -896,7 +896,8 @@ def weighted_median(array, sample_weight): sample_weight = np.asarray(sample_weight) sorted_sample_weight = sample_weight[sorted_idx] weight_cdf = sorted_sample_weight.cumsum() - weighted_percentile = (weight_cdf - sorted_sample_weight/2.0) / weight_cdf[-1] + weighted_percentile = weight_cdf - sorted_sample_weight/2.0 + weighted_percentile /= weight_cdf[-1] sorted_array = array[sorted_idx] weighted_median = np.interp(0.5, weighted_percentile, sorted_array) return weighted_median