From eb70678dd96eff3e115d8c2d098c78d102e58b92 Mon Sep 17 00:00:00 2001 From: ashutosh_hathidara Date: Wed, 18 Sep 2019 11:27:43 +0530 Subject: [PATCH 001/103] Added mean_absolute_percentage_error in metrics --- sklearn/metrics/__init__.py | 2 + sklearn/metrics/regression.py | 71 ++++++++++++++++++++++++ sklearn/metrics/tests/test_regression.py | 12 ++++ 3 files changed, 85 insertions(+) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index b0846f2ff6828..a06f189422655 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -66,6 +66,7 @@ from .regression import mean_squared_error from .regression import mean_squared_log_error from .regression import median_absolute_error +from .regression import mean_absolute_percentage_error from .regression import r2_score from .regression import mean_tweedie_deviance from .regression import mean_poisson_deviance @@ -126,6 +127,7 @@ 'mean_gamma_deviance', 'mean_tweedie_deviance', 'median_absolute_error', + 'mean_absolute_percentage_error', 'multilabel_confusion_matrix', 'mutual_info_score', 'ndcg_score', diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 67a5cdd590e30..ecc2821449bf0 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -189,6 +189,77 @@ def mean_absolute_error(y_true, y_pred, return np.average(output_errors, weights=multioutput) +def mean_absolute_percentage_error(y_true, y_pred, + sample_weight=None, + multioutput='uniform_average'): + """Mean absolute percentage error regression loss + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape = (n_samples) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape = (n_samples), optional + Sample weights. 
+ + multioutput : string in ['raw_values', 'uniform_average'] + or array-like of shape (n_outputs) + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + + Returns + ------- + loss : float or ndarray of floats + If multioutput is 'raw_values', then mean absolute percentage error is returned + for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + MAPE output is non-negative floating point. The best value is 0.0. + + Examples + -------- + >>> from sklearn.metrics import mean_absolute_percentage_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> mean_absolute_percentage_error(y_true, y_pred) + 14.58... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> mean_absolute_percentage_error(y_true, y_pred) + 26.68... + >>> mean_absolute_percentage_error(y_true, y_pred, multioutput='raw_values') + array([15.27777778, 38.0952381 ]) + >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 31.24... 
+ """ + y_type, y_true, y_pred, multioutput = _check_reg_targets( + y_true, y_pred, multioutput) + check_consistent_length(y_true, y_pred, sample_weight) + output_errors = np.average(np.abs((y_pred - y_true) / (1 + np.abs(y_true))), + weights=sample_weight, axis=0) * 100.0 + if isinstance(multioutput, str): + if multioutput == 'raw_values': + return output_errors + elif multioutput == 'uniform_average': + # pass None as weights to np.average: uniform mean + multioutput = None + + return np.average(output_errors, weights=multioutput) + + def mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average', squared=True): diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index b6ce1434d6861..ade9e23225a99 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -13,6 +13,7 @@ from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_squared_log_error from sklearn.metrics import median_absolute_error +from sklearn.metrics import mean_absolute_percentage_error from sklearn.metrics import max_error from sklearn.metrics import r2_score from sklearn.metrics import mean_tweedie_deviance @@ -32,6 +33,8 @@ def test_regression_metrics(n_samples=50): np.log(1 + y_pred))) assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.) assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) + assert_almost_equal(mean_absolute_percentage_error(y_true,y_pred), + 8.99, decimal=2) assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) @@ -74,6 +77,9 @@ def test_multioutput_regression(): error = mean_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 2. / 3) / 4.) 
+ error = mean_absolute_percentage_error(y_true, y_pred) + assert_almost_equal(error, 24.99, decimal=2) + error = r2_score(y_true, y_pred, multioutput='variance_weighted') assert_almost_equal(error, 1. - 5. / 2) error = r2_score(y_true, y_pred, multioutput='uniform_average') @@ -85,6 +91,7 @@ def test_regression_metrics_at_limits(): assert_almost_equal(mean_squared_error([0.], [0.], squared=False), 0.00, 2) assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.00, 2) assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2) + assert_almost_equal(mean_absolute_percentage_error([0.], [0.]), 0.00, 2) assert_almost_equal(median_absolute_error([0.], [0.]), 0.00, 2) assert_almost_equal(max_error([0.], [0.]), 0.00, 2) assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2) @@ -183,11 +190,13 @@ def test_regression_multioutput_array(): mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') + mape = mean_absolute_percentage_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') evs = explained_variance_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) + assert_array_almost_equal(mape, [ 5.844, 16.145], decimal=2) assert_array_almost_equal(r, [0.95, 0.93], decimal=2) assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) @@ -239,12 +248,15 @@ def test_regression_custom_weights(): rmsew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6], squared=False) maew = mean_absolute_error(y_true, y_pred, multioutput=[0.4, 0.6]) + mapew = mean_absolute_percentage_error(y_true, y_pred, + multioutput=[0.4, 0.6]) rw = r2_score(y_true, y_pred, multioutput=[0.4, 0.6]) evsw = explained_variance_score(y_true, y_pred, multioutput=[0.4, 0.6]) assert_almost_equal(msew, 0.39, decimal=2) assert_almost_equal(rmsew, 0.62, decimal=2) 
assert_almost_equal(maew, 0.475, decimal=3) + assert_almost_equal(mapew, 12.025, decimal=2) assert_almost_equal(rw, 0.94, decimal=2) assert_almost_equal(evsw, 0.94, decimal=2) From 0fca06e2c67ddf8d0a71d0ee0d22029b55f3be5c Mon Sep 17 00:00:00 2001 From: ashutosh_hathidara Date: Wed, 18 Sep 2019 11:37:36 +0530 Subject: [PATCH 002/103] Added mean_absolute_percentage_error in metrics --- sklearn/metrics/regression.py | 3 ++- sklearn/metrics/tests/test_regression.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index ecc2821449bf0..ee96ef87a5056 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -248,7 +248,8 @@ def mean_absolute_percentage_error(y_true, y_pred, y_type, y_true, y_pred, multioutput = _check_reg_targets( y_true, y_pred, multioutput) check_consistent_length(y_true, y_pred, sample_weight) - output_errors = np.average(np.abs((y_pred - y_true) / (1 + np.abs(y_true))), + mape = np.abs((y_pred - y_true) / (1 + np.abs(y_true))) + output_errors = np.average(mape, weights=sample_weight, axis=0) * 100.0 if isinstance(multioutput, str): if multioutput == 'raw_values': diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index ade9e23225a99..c638da5a211f6 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -33,7 +33,7 @@ def test_regression_metrics(n_samples=50): np.log(1 + y_pred))) assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.) assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) - assert_almost_equal(mean_absolute_percentage_error(y_true,y_pred), + assert_almost_equal(mean_absolute_percentage_error(y_true, y_pred), 8.99, decimal=2) assert_almost_equal(max_error(y_true, y_pred), 1.) 
assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) @@ -190,13 +190,14 @@ def test_regression_multioutput_array(): mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') - mape = mean_absolute_percentage_error(y_true, y_pred, multioutput='raw_values') + mape = mean_absolute_percentage_error(y_true, y_pred, + multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') evs = explained_variance_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) - assert_array_almost_equal(mape, [ 5.844, 16.145], decimal=2) + assert_array_almost_equal(mape, [5.844, 16.145], decimal=2) assert_array_almost_equal(r, [0.95, 0.93], decimal=2) assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) From 7191b88757ad5c2d27c7ecbf260358c44ab96b8a Mon Sep 17 00:00:00 2001 From: ashutosh_hathidara Date: Wed, 18 Sep 2019 11:52:58 +0530 Subject: [PATCH 003/103] Added mean_absolute_percentage_error in metrics --- sklearn/metrics/tests/test_regression.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index c638da5a211f6..1eea82beebe33 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -190,6 +190,8 @@ def test_regression_multioutput_array(): mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') + + # mean_absolute_percentage_error tests mape = mean_absolute_percentage_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') From 140afe298ab834b5c71e528b01481af9a8338f9a Mon Sep 17 00:00:00 2001 From: ashutosh_hathidara Date: Wed, 18 Sep 2019 13:54:49 +0530 Subject: [PATCH 004/103] Added MAPE --- 
sklearn/metrics/tests/test_regression.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 1eea82beebe33..e4f72da885a02 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -33,7 +33,9 @@ def test_regression_metrics(n_samples=50): np.log(1 + y_pred))) assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.) assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) - assert_almost_equal(mean_absolute_percentage_error(y_true, y_pred), + assert_almost_equal(np.around(mean_absolute_percentage_error(y_true, + y_pred), + decimals=2), 8.99, decimal=2) assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) @@ -77,7 +79,8 @@ def test_multioutput_regression(): error = mean_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 2. / 3) / 4.) - error = mean_absolute_percentage_error(y_true, y_pred) + error = np.around(mean_absolute_percentage_error(y_true, y_pred), + decimals=2) assert_almost_equal(error, 24.99, decimal=2) error = r2_score(y_true, y_pred, multioutput='variance_weighted') @@ -190,8 +193,6 @@ def test_regression_multioutput_array(): mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') - - # mean_absolute_percentage_error tests mape = mean_absolute_percentage_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') From 41c1bd1317bc3e87f1ff8b9c07abf03d4bae4edd Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Wed, 18 Sep 2019 14:15:44 +0530 Subject: [PATCH 005/103] Added MAPE --- sklearn/metrics/tests/test_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index e4f72da885a02..4bbe27d3b2408 
100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -36,7 +36,7 @@ def test_regression_metrics(n_samples=50): assert_almost_equal(np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2), - 8.99, decimal=2) + 9.0, decimal=2) assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) @@ -81,7 +81,7 @@ def test_multioutput_regression(): error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) - assert_almost_equal(error, 24.99, decimal=2) + assert_almost_equal(error, 25.0, decimal=2) error = r2_score(y_true, y_pred, multioutput='variance_weighted') assert_almost_equal(error, 1. - 5. / 2) From a401965308bb96076d526b3d25f6301e61c58a71 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Wed, 18 Sep 2019 14:37:05 +0530 Subject: [PATCH 006/103] Added MAPE --- sklearn/metrics/regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index ee96ef87a5056..8f5edaccc9f23 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -222,8 +222,8 @@ def mean_absolute_percentage_error(y_true, y_pred, Returns ------- loss : float or ndarray of floats - If multioutput is 'raw_values', then mean absolute percentage error is returned - for each output separately. + If multioutput is 'raw_values', then mean absolute percentage error + is returned for each output separately. If multioutput is 'uniform_average' or an ndarray of weights, then the weighted average of all output errors is returned. 
From 39a7af04e4289114973a9d68ff434d3dccfa99a5 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Wed, 18 Sep 2019 14:40:51 +0530 Subject: [PATCH 007/103] Added MAPE --- sklearn/metrics/regression.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 8f5edaccc9f23..cca4febd1c954 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -222,7 +222,7 @@ def mean_absolute_percentage_error(y_true, y_pred, Returns ------- loss : float or ndarray of floats - If multioutput is 'raw_values', then mean absolute percentage error + If multioutput is 'raw_values', then mean absolute percentage error is returned for each output separately. If multioutput is 'uniform_average' or an ndarray of weights, then the weighted average of all output errors is returned. @@ -240,7 +240,8 @@ def mean_absolute_percentage_error(y_true, y_pred, >>> y_pred = [[0, 2], [-1, 2], [8, -5]] >>> mean_absolute_percentage_error(y_true, y_pred) 26.68... - >>> mean_absolute_percentage_error(y_true, y_pred, multioutput='raw_values') + >>> mean_absolute_percentage_error(y_true, y_pred, + multioutput='raw_values') array([15.27777778, 38.0952381 ]) >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7]) 31.24... From 0aa9b53c235762417f9299c38268e339f0173e48 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Wed, 18 Sep 2019 14:58:02 +0530 Subject: [PATCH 008/103] Added MAPE --- sklearn/metrics/regression.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index cca4febd1c954..d91203563eebf 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -240,9 +240,6 @@ def mean_absolute_percentage_error(y_true, y_pred, >>> y_pred = [[0, 2], [-1, 2], [8, -5]] >>> mean_absolute_percentage_error(y_true, y_pred) 26.68... 
- >>> mean_absolute_percentage_error(y_true, y_pred, - multioutput='raw_values') - array([15.27777778, 38.0952381 ]) >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7]) 31.24... """ From b8f5187e810e3bcd7543a596a3a8378a1747b0b4 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Wed, 18 Sep 2019 16:47:40 +0530 Subject: [PATCH 009/103] MAPE implementation changed --- sklearn/metrics/regression.py | 11 +++++++---- sklearn/metrics/tests/test_regression.py | 8 ++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index d91203563eebf..50c566b8112c5 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -235,18 +235,21 @@ def mean_absolute_percentage_error(y_true, y_pred, >>> y_true = [3, -0.5, 2, 7] >>> y_pred = [2.5, 0.0, 2, 8] >>> mean_absolute_percentage_error(y_true, y_pred) - 14.58... + 32.73... >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] >>> y_pred = [[0, 2], [-1, 2], [8, -5]] >>> mean_absolute_percentage_error(y_true, y_pred) - 26.68... + 55.15... >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7]) - 31.24... + 61.98... 
""" y_type, y_true, y_pred, multioutput = _check_reg_targets( y_true, y_pred, multioutput) check_consistent_length(y_true, y_pred, sample_weight) - mape = np.abs((y_pred - y_true) / (1 + np.abs(y_true))) + epsilon = np.finfo(float).eps + a_max = np.max(np.abs(y_true)) + denom = np.clip(np.abs(y_true), epsilon, a_max) + mape = np.abs((y_pred - y_true) / denom) output_errors = np.average(mape, weights=sample_weight, axis=0) * 100.0 if isinstance(multioutput, str): diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 4bbe27d3b2408..c6c29bc55606d 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -36,7 +36,7 @@ def test_regression_metrics(n_samples=50): assert_almost_equal(np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2), - 9.0, decimal=2) + 900e+13, decimal=-13) assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) @@ -81,7 +81,7 @@ def test_multioutput_regression(): error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) - assert_almost_equal(error, 25.0, decimal=2) + assert_almost_equal(error, 375e+14, decimal=-14) error = r2_score(y_true, y_pred, multioutput='variance_weighted') assert_almost_equal(error, 1. - 5. 
/ 2) @@ -200,7 +200,7 @@ def test_regression_multioutput_array(): assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) - assert_array_almost_equal(mape, [5.844, 16.145], decimal=2) + assert_array_almost_equal(mape, [7.78, 22.62], decimal=2) assert_array_almost_equal(r, [0.95, 0.93], decimal=2) assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) @@ -260,7 +260,7 @@ def test_regression_custom_weights(): assert_almost_equal(msew, 0.39, decimal=2) assert_almost_equal(rmsew, 0.62, decimal=2) assert_almost_equal(maew, 0.475, decimal=3) - assert_almost_equal(mapew, 12.025, decimal=2) + assert_almost_equal(mapew, 16.68, decimal=2) assert_almost_equal(rw, 0.94, decimal=2) assert_almost_equal(evsw, 0.94, decimal=2) From f83fdf4b1254211ca702b704af1a75b5638cfe4c Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Wed, 18 Sep 2019 17:11:12 +0530 Subject: [PATCH 010/103] MAPE implementation changed --- sklearn/metrics/regression.py | 1 + sklearn/metrics/tests/test_regression.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 50c566b8112c5..f33de7cb1b6fc 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -248,6 +248,7 @@ def mean_absolute_percentage_error(y_true, y_pred, check_consistent_length(y_true, y_pred, sample_weight) epsilon = np.finfo(float).eps a_max = np.max(np.abs(y_true)) + a_max = max(epsilon, a_max) denom = np.clip(np.abs(y_true), epsilon, a_max) mape = np.abs((y_pred - y_true) / denom) output_errors = np.average(mape, diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index c6c29bc55606d..5bf91269265d5 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -36,7 +36,7 @@ def test_regression_metrics(n_samples=50): assert_almost_equal(np.around(mean_absolute_percentage_error(y_true, y_pred), 
decimals=2), - 900e+13, decimal=-13) + 9007199254741000.0, decimal=-13) assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) From 6b2ead24693402c939dba1ae321bff8620402f79 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Thu, 19 Sep 2019 10:20:18 +0530 Subject: [PATCH 011/103] Removed Clip and applied np.maximum --- sklearn/metrics/regression.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index f33de7cb1b6fc..fcf11ef74e2f8 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -247,10 +247,7 @@ def mean_absolute_percentage_error(y_true, y_pred, y_true, y_pred, multioutput) check_consistent_length(y_true, y_pred, sample_weight) epsilon = np.finfo(float).eps - a_max = np.max(np.abs(y_true)) - a_max = max(epsilon, a_max) - denom = np.clip(np.abs(y_true), epsilon, a_max) - mape = np.abs((y_pred - y_true) / denom) + mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon) output_errors = np.average(mape, weights=sample_weight, axis=0) * 100.0 if isinstance(multioutput, str): From 2c7c8a5f1e2962a180b9e438dc0671f9c8a01daa Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Fri, 20 Sep 2019 16:36:49 +0530 Subject: [PATCH 012/103] MAPE Added in Docs --- doc/modules/classes.rst | 1 + doc/modules/model_evaluation.rst | 32 +++++++++++++++++++++ doc/whats_new/_contributors.rst | 2 ++ doc/whats_new/v0.16.rst | 3 ++ sklearn/metrics/tests/test_score_objects.py | 2 ++ 5 files changed, 40 insertions(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 16658a39b1612..276cd47b66a5c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -914,6 +914,7 @@ details. 
metrics.mean_squared_error metrics.mean_squared_log_error metrics.median_absolute_error + metrics.mean_absolute_percentage_error metrics.r2_score metrics.mean_poisson_deviance metrics.mean_gamma_deviance diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 56fba2e2d5f5a..1799f86136b9b 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -95,6 +95,7 @@ Scoring Function 'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` 'neg_median_absolute_error' :func:`metrics.median_absolute_error` +'neg_mean_absolute_percentage_error' :func:`metrics.mean_absolute_percentage_error` 'r2' :func:`metrics.r2_score` 'neg_mean_poisson_deviance' :func:`metrics.mean_poisson_deviance` 'neg_mean_gamma_deviance' :func:`metrics.mean_gamma_deviance` @@ -1936,6 +1937,37 @@ function:: >>> mean_squared_log_error(y_true, y_pred) 0.044... +.. _mean_absolute_percentage_error: + +Mean absolute percentage error +------------------------------ +The :func:`mean_absolute_percentage_error` is particularly interesting because +it is directly understood by the users who don't have technical background. +It is modification of `mean_absolute_error`. The loss is calculated by taking +average of all differences between the target and the prediction divided by +the target and multiplying resulting term by 100.0 to convert it into percentage. + +If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample +and :math:`y_i` is the corresponding true value, then the mean absolute percentage +error (MAPE) estimated over :math:`n_{\text{samples}}` is defined as + +.. math:: + + \text{MAPE}(y, \hat{y}) = \frac{\mid{y} - \hat{y}\mid}{\text{MAX}(\epsilon, \mid{y}\mid)} +Where :math:`\epsilon` is a very small number greater than zero and kept in order to avoid +the crash when y is zero. + +The :func:`mean_absolute_percentage_error` support multioutput. 
+ +Here is a small example of usage of the :func:`mean_absolute_percentage_error` +function:: + + >>> from sklearn.metrics import mean_absolute_percentage_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> mean_absolute_percentage_error(y_true, y_pred) + 32.73... + .. _median_absolute_error: Median absolute error diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index bc99bc89f572d..549655e606da7 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -175,3 +175,5 @@ .. _Thomas Fan: https://github.com/thomasjpfan .. _Nicolas Hug: https://github.com/NicolasHug + +.. _Ashutosh Hathidara: https://github.com/ashutosh1919 \ No newline at end of file diff --git a/doc/whats_new/v0.16.rst b/doc/whats_new/v0.16.rst index 931c7e0fbb923..3477e620f7c50 100644 --- a/doc/whats_new/v0.16.rst +++ b/doc/whats_new/v0.16.rst @@ -118,6 +118,9 @@ New features - Added :func:`metrics.median_absolute_error`, a robust metric. By `Gael Varoquaux`_ and :user:`Florian Wilhelm `. +- Added :func:`metrics.mean_absolute_percentage_error`, a robust metric. + By :user:`Ashutosh Hathidara ` and `Alexandre Gramfort`_. + - Add :class:`cluster.Birch`, an online clustering algorithm. By `Manoj Kumar`_, `Alexandre Gramfort`_ and `Joel Nothman`_. 
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 71f3c80c72409..7feca2b19bf00 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -41,10 +41,12 @@ REGRESSION_SCORERS = ['explained_variance', 'r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', + 'neg_mean_absolute_percentage_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error', 'mean_absolute_error', + 'mean_absolute_percentage_error', 'mean_squared_error', 'median_absolute_error', 'max_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance'] From 65afa120b265084d2c0ab5a7a86185bdec0399f6 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 24 Sep 2019 10:13:53 +0530 Subject: [PATCH 013/103] Changed model_evaluation descriptions and other changes --- doc/modules/model_evaluation.rst | 11 +++++------ doc/whats_new/v0.16.rst | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 1799f86136b9b..8e0ef19cd1aa4 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -95,7 +95,6 @@ Scoring Function 'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` 'neg_median_absolute_error' :func:`metrics.median_absolute_error` -'neg_mean_absolute_percentage_error' :func:`metrics.mean_absolute_percentage_error` 'r2' :func:`metrics.r2_score` 'neg_mean_poisson_deviance' :func:`metrics.mean_poisson_deviance` 'neg_mean_gamma_deviance' :func:`metrics.mean_gamma_deviance` @@ -1941,11 +1940,11 @@ function:: Mean absolute percentage error ------------------------------ -The :func:`mean_absolute_percentage_error` is particularly interesting because -it is directly understood by the users who don't have technical background. -It is modification of `mean_absolute_error`. 
The loss is calculated by taking -average of all differences between the target and the prediction divided by -the target and multiplying resulting term by 100.0 to convert it into percentage. +The :func:`mean_absolute_percentage_error` (MAPE), also known as mean absolute +percentage deviation (MAPD), is a measure of prediction accuracy of a forecasting +method in statistics, for example in trend estimation, also used as a loss function +for regression problems in machine learning. It is therefore blind to global scaling +of the target as long as the prediction is scaled like the true y. If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample and :math:`y_i` is the corresponding true value, then the mean absolute percentage diff --git a/doc/whats_new/v0.16.rst b/doc/whats_new/v0.16.rst index 3477e620f7c50..4eb4a10d82e92 100644 --- a/doc/whats_new/v0.16.rst +++ b/doc/whats_new/v0.16.rst @@ -119,7 +119,7 @@ New features By `Gael Varoquaux`_ and :user:`Florian Wilhelm `. - Added :func:`metrics.mean_absolute_percentage_error`, a robust metric. - By :user:`Ashutosh Hathidara ` and `Alexandre Gramfort`_. + By :user:`Ashutosh Hathidara `. - Add :class:`cluster.Birch`, an online clustering algorithm. By `Manoj Kumar`_, `Alexandre Gramfort`_ and `Joel Nothman`_. 
From 99d080d438ccfdfecbbdc1f325fc7c83c907e457 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 24 Sep 2019 11:18:34 +0530 Subject: [PATCH 014/103] Resolving error --- sklearn/metrics/tests/test_score_objects.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 7feca2b19bf00..04784dd78e7cb 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -41,7 +41,6 @@ REGRESSION_SCORERS = ['explained_variance', 'r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', - 'neg_mean_absolute_percentage_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error', From dc988ae1a8daba4dbacfdaf1a6dff63851b065a0 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 24 Sep 2019 19:47:26 +0530 Subject: [PATCH 015/103] model_evaluation table changed --- doc/modules/model_evaluation.rst | 75 ++++++++++++++++---------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 8e0ef19cd1aa4..dd89ad0d4f21b 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -58,47 +58,48 @@ of the metric. 
Scoring Function Comment ============================== ============================================= ================================== **Classification** -'accuracy' :func:`metrics.accuracy_score` -'balanced_accuracy' :func:`metrics.balanced_accuracy_score` -'average_precision' :func:`metrics.average_precision_score` -'brier_score_loss' :func:`metrics.brier_score_loss` -'f1' :func:`metrics.f1_score` for binary targets -'f1_micro' :func:`metrics.f1_score` micro-averaged -'f1_macro' :func:`metrics.f1_score` macro-averaged -'f1_weighted' :func:`metrics.f1_score` weighted average -'f1_samples' :func:`metrics.f1_score` by multilabel sample -'neg_log_loss' :func:`metrics.log_loss` requires ``predict_proba`` support -'precision' etc. :func:`metrics.precision_score` suffixes apply as with 'f1' -'recall' etc. :func:`metrics.recall_score` suffixes apply as with 'f1' -'jaccard' etc. :func:`metrics.jaccard_score` suffixes apply as with 'f1' -'roc_auc' :func:`metrics.roc_auc_score` -'roc_auc_ovr' :func:`metrics.roc_auc_score` -'roc_auc_ovo' :func:`metrics.roc_auc_score` -'roc_auc_ovr_weighted' :func:`metrics.roc_auc_score` -'roc_auc_ovo_weighted' :func:`metrics.roc_auc_score` +'accuracy' :func:`metrics.accuracy_score` +'balanced_accuracy' :func:`metrics.balanced_accuracy_score` +'average_precision' :func:`metrics.average_precision_score` +'brier_score_loss' :func:`metrics.brier_score_loss` +'f1' :func:`metrics.f1_score` for binary targets +'f1_micro' :func:`metrics.f1_score` micro-averaged +'f1_macro' :func:`metrics.f1_score` macro-averaged +'f1_weighted' :func:`metrics.f1_score` weighted average +'f1_samples' :func:`metrics.f1_score` by multilabel sample +'neg_log_loss' :func:`metrics.log_loss` requires ``predict_proba`` support +'precision' etc. :func:`metrics.precision_score` suffixes apply as with 'f1' +'recall' etc. :func:`metrics.recall_score` suffixes apply as with 'f1' +'jaccard' etc. 
:func:`metrics.jaccard_score` suffixes apply as with 'f1' +'roc_auc' :func:`metrics.roc_auc_score` +'roc_auc_ovr' :func:`metrics.roc_auc_score` +'roc_auc_ovo' :func:`metrics.roc_auc_score` +'roc_auc_ovr_weighted' :func:`metrics.roc_auc_score` +'roc_auc_ovo_weighted' :func:`metrics.roc_auc_score` **Clustering** -'adjusted_mutual_info_score' :func:`metrics.adjusted_mutual_info_score` -'adjusted_rand_score' :func:`metrics.adjusted_rand_score` -'completeness_score' :func:`metrics.completeness_score` -'fowlkes_mallows_score' :func:`metrics.fowlkes_mallows_score` -'homogeneity_score' :func:`metrics.homogeneity_score` -'mutual_info_score' :func:`metrics.mutual_info_score` -'normalized_mutual_info_score' :func:`metrics.normalized_mutual_info_score` -'v_measure_score' :func:`metrics.v_measure_score` +'adjusted_mutual_info_score' :func:`metrics.adjusted_mutual_info_score` +'adjusted_rand_score' :func:`metrics.adjusted_rand_score` +'completeness_score' :func:`metrics.completeness_score` +'fowlkes_mallows_score' :func:`metrics.fowlkes_mallows_score` +'homogeneity_score' :func:`metrics.homogeneity_score` +'mutual_info_score' :func:`metrics.mutual_info_score` +'normalized_mutual_info_score' :func:`metrics.normalized_mutual_info_score` +'v_measure_score' :func:`metrics.v_measure_score` **Regression** -'explained_variance' :func:`metrics.explained_variance_score` -'max_error' :func:`metrics.max_error` -'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -'neg_mean_squared_error' :func:`metrics.mean_squared_error` -'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` -'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` -'neg_median_absolute_error' :func:`metrics.median_absolute_error` -'r2' :func:`metrics.r2_score` -'neg_mean_poisson_deviance' :func:`metrics.mean_poisson_deviance` -'neg_mean_gamma_deviance' :func:`metrics.mean_gamma_deviance` -============================== ============================================= 
================================== +'explained_variance' :func:`metrics.explained_variance_score` +'max_error' :func:`metrics.max_error` +'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` +'neg_mean_absolute_percentage_error' :func:`metrics.mean_absolute_percentage_error` +'neg_mean_squared_error' :func:`metrics.mean_squared_error` +'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` +'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` +'neg_median_absolute_error' :func:`metrics.median_absolute_error` +'r2' :func:`metrics.r2_score` +'neg_mean_poisson_deviance' :func:`metrics.mean_poisson_deviance` +'neg_mean_gamma_deviance' :func:`metrics.mean_gamma_deviance` +================================== ========================================== ================================== Usage examples: From 8274e28efadd26a5680d91b2bdc556b7577d4006 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 24 Sep 2019 19:49:46 +0530 Subject: [PATCH 016/103] model_evaluation table changed --- doc/modules/model_evaluation.rst | 77 ++++++++++++++++---------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index dd89ad0d4f21b..070f0b1d04881 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -58,48 +58,49 @@ of the metric. 
Scoring Function Comment ============================== ============================================= ================================== **Classification** -'accuracy' :func:`metrics.accuracy_score` -'balanced_accuracy' :func:`metrics.balanced_accuracy_score` -'average_precision' :func:`metrics.average_precision_score` -'brier_score_loss' :func:`metrics.brier_score_loss` -'f1' :func:`metrics.f1_score` for binary targets -'f1_micro' :func:`metrics.f1_score` micro-averaged -'f1_macro' :func:`metrics.f1_score` macro-averaged -'f1_weighted' :func:`metrics.f1_score` weighted average -'f1_samples' :func:`metrics.f1_score` by multilabel sample -'neg_log_loss' :func:`metrics.log_loss` requires ``predict_proba`` support -'precision' etc. :func:`metrics.precision_score` suffixes apply as with 'f1' -'recall' etc. :func:`metrics.recall_score` suffixes apply as with 'f1' -'jaccard' etc. :func:`metrics.jaccard_score` suffixes apply as with 'f1' -'roc_auc' :func:`metrics.roc_auc_score` -'roc_auc_ovr' :func:`metrics.roc_auc_score` -'roc_auc_ovo' :func:`metrics.roc_auc_score` -'roc_auc_ovr_weighted' :func:`metrics.roc_auc_score` -'roc_auc_ovo_weighted' :func:`metrics.roc_auc_score` +'accuracy' :func:`metrics.accuracy_score` +'balanced_accuracy' :func:`metrics.balanced_accuracy_score` +'average_precision' :func:`metrics.average_precision_score` +'brier_score_loss' :func:`metrics.brier_score_loss` +'f1' :func:`metrics.f1_score` for binary targets +'f1_micro' :func:`metrics.f1_score` micro-averaged +'f1_macro' :func:`metrics.f1_score` macro-averaged +'f1_weighted' :func:`metrics.f1_score` weighted average +'f1_samples' :func:`metrics.f1_score` by multilabel sample +'neg_log_loss' :func:`metrics.log_loss` requires ``predict_proba`` support +'precision' etc. :func:`metrics.precision_score` suffixes apply as with 'f1' +'recall' etc. :func:`metrics.recall_score` suffixes apply as with 'f1' +'jaccard' etc. 
:func:`metrics.jaccard_score` suffixes apply as with 'f1' +'roc_auc' :func:`metrics.roc_auc_score` +'roc_auc_ovr' :func:`metrics.roc_auc_score` +'roc_auc_ovo' :func:`metrics.roc_auc_score` +'roc_auc_ovr_weighted' :func:`metrics.roc_auc_score` +'roc_auc_ovo_weighted' :func:`metrics.roc_auc_score` **Clustering** -'adjusted_mutual_info_score' :func:`metrics.adjusted_mutual_info_score` -'adjusted_rand_score' :func:`metrics.adjusted_rand_score` -'completeness_score' :func:`metrics.completeness_score` -'fowlkes_mallows_score' :func:`metrics.fowlkes_mallows_score` -'homogeneity_score' :func:`metrics.homogeneity_score` -'mutual_info_score' :func:`metrics.mutual_info_score` -'normalized_mutual_info_score' :func:`metrics.normalized_mutual_info_score` -'v_measure_score' :func:`metrics.v_measure_score` +'adjusted_mutual_info_score' :func:`metrics.adjusted_mutual_info_score` +'adjusted_rand_score' :func:`metrics.adjusted_rand_score` +'completeness_score' :func:`metrics.completeness_score` +'fowlkes_mallows_score' :func:`metrics.fowlkes_mallows_score` +'homogeneity_score' :func:`metrics.homogeneity_score` +'mutual_info_score' :func:`metrics.mutual_info_score` +'normalized_mutual_info_score' :func:`metrics.normalized_mutual_info_score` +'v_measure_score' :func:`metrics.v_measure_score` **Regression** -'explained_variance' :func:`metrics.explained_variance_score` -'max_error' :func:`metrics.max_error` -'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -'neg_mean_absolute_percentage_error' :func:`metrics.mean_absolute_percentage_error` -'neg_mean_squared_error' :func:`metrics.mean_squared_error` -'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` -'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` -'neg_median_absolute_error' :func:`metrics.median_absolute_error` -'r2' :func:`metrics.r2_score` -'neg_mean_poisson_deviance' :func:`metrics.mean_poisson_deviance` -'neg_mean_gamma_deviance' :func:`metrics.mean_gamma_deviance` 
-================================== ========================================== ================================== +'explained_variance' :func:`metrics.explained_variance_score` +'max_error' :func:`metrics.max_error` +'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` +'neg_mean_absolute_percentage_error' + :func:`metrics.mean_absolute_percentage_error` +'neg_mean_squared_error' :func:`metrics.mean_squared_error` +'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` +'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` +'neg_median_absolute_error' :func:`metrics.median_absolute_error` +'r2' :func:`metrics.r2_score` +'neg_mean_poisson_deviance' :func:`metrics.mean_poisson_deviance` +'neg_mean_gamma_deviance' :func:`metrics.mean_gamma_deviance` +============================== ============================================= ================================== Usage examples: From 647ec2cb348f729c9c2e596b738ee172d70d8589 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 24 Sep 2019 20:19:56 +0530 Subject: [PATCH 017/103] model_evaluation table changed --- sklearn/metrics/tests/test_score_objects.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 04784dd78e7cb..7feca2b19bf00 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -41,6 +41,7 @@ REGRESSION_SCORERS = ['explained_variance', 'r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', + 'neg_mean_absolute_percentage_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error', From 0f53d4f03da4dd8dde0017eda2ea8ffc6c43f036 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Thu, 26 Sep 2019 14:52:39 +0530 Subject: [PATCH 018/103] Changed Doc line --- doc/modules/model_evaluation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst 
b/doc/modules/model_evaluation.rst index 26deda21183da..df0df661b069d 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1945,8 +1945,8 @@ Mean absolute percentage error The :func: The `mean_absolute_percentage_error` (MAPE), also known as mean absolute percentage deviation (MAPD), is a measure of prediction accuracy of a forecasting method in statistics, for example in trend estimation, also used as a loss function -for regression problems in machine learning. It is therefore blind to global scaling -of the target as long as the prediction is scaled like the true y. +for regression problems in machine learning. It is blind to global scaling of the +target as long as the prediction is scaled like the true y. If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample and :math:`y_i` is the corresponding true value, then the mean absolute percentage From cdb5d09ca063fe8fdaf4d85cc05e969e29a51f4f Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Sun, 17 Nov 2019 18:08:49 +0530 Subject: [PATCH 019/103] metrics init file changed --- sklearn/metrics/__init__.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index f84a10748991d..1fb627a76332a 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -60,19 +60,6 @@ from .pairwise import pairwise_kernels from .pairwise import pairwise_distances_chunked -<<<<<<< HEAD -from .regression import explained_variance_score -from .regression import max_error -from .regression import mean_absolute_error -from .regression import mean_squared_error -from .regression import mean_squared_log_error -from .regression import median_absolute_error -from .regression import mean_absolute_percentage_error -from .regression import r2_score -from .regression import mean_tweedie_deviance -from .regression import mean_poisson_deviance -from .regression import mean_gamma_deviance -======= from ._regression 
import explained_variance_score from ._regression import max_error from ._regression import mean_absolute_error @@ -83,7 +70,6 @@ from ._regression import mean_tweedie_deviance from ._regression import mean_poisson_deviance from ._regression import mean_gamma_deviance ->>>>>>> upstream/master from ._scorer import check_scoring From 3ec0dd8c5f1b6ef393f1f6ad321d7d1c7f2d77dd Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Sun, 17 Nov 2019 18:11:26 +0530 Subject: [PATCH 020/103] test_regression resolved --- sklearn/metrics/tests/test_regression.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 8ea652b3efa23..b30259901d3fe 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -79,14 +79,11 @@ def test_multioutput_regression(): error = mean_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 2. / 3) / 4.) -<<<<<<< HEAD error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) assert_almost_equal(error, 375e+14, decimal=-14) -======= error = median_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 1.) / 4.) ->>>>>>> upstream/master error = r2_score(y_true, y_pred, multioutput='variance_weighted') assert_almost_equal(error, 1. - 5. 
/ 2) From cf54616f8e53379184820678fd03bdb37309df50 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Sun, 17 Nov 2019 18:36:04 +0530 Subject: [PATCH 021/103] test_regression resolved --- sklearn/metrics/__init__.py | 1 + sklearn/metrics/_regression.py | 1 + 2 files changed, 2 insertions(+) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 1fb627a76332a..cfad24409d47c 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -66,6 +66,7 @@ from ._regression import mean_squared_error from ._regression import mean_squared_log_error from ._regression import median_absolute_error +from ._regression import mean_absolute_percentage_error from ._regression import r2_score from ._regression import mean_tweedie_deviance from ._regression import mean_poisson_deviance diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 2940b696c3d58..2384825951bf3 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -39,6 +39,7 @@ "mean_squared_error", "mean_squared_log_error", "median_absolute_error", + "mean_absolute_percentage_error", "r2_score", "explained_variance_score", "mean_tweedie_deviance", From b4d7336cba95a45c5127374f38b00fe69e4a18a4 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Sun, 29 Dec 2019 21:26:42 +0530 Subject: [PATCH 022/103] Render error --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 9150b5444485d..62baa50386b93 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -91,7 +91,7 @@ Scoring Function 'explained_variance' :func:`metrics.explained_variance_score` 'max_error' :func:`metrics.max_error` 'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -'neg_mean_absolute_percentage_error' :func:`metrics.mean_absolute_percentage_error` +'mean_absolute_percentage_error' 
:func:`metrics.mean_absolute_percentage_error` 'neg_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` From 76a4bf97783425e7bc0dd657dfde8893dc3efb4d Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Mon, 6 Jan 2020 16:58:42 +0530 Subject: [PATCH 023/103] Resolving render errors --- doc/modules/model_evaluation.rst | 3 ++- doc/whats_new/_contributors.rst | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 62baa50386b93..ec8bfe4252751 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -91,7 +91,8 @@ Scoring Function 'explained_variance' :func:`metrics.explained_variance_score` 'max_error' :func:`metrics.max_error` 'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -'mean_absolute_percentage_error' :func:`metrics.mean_absolute_percentage_error` +'neg_mean_absolute_percentage_error' + :func:`metrics.mean_absolute_percentage_error` 'neg_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index f5d650b19da20..a56e75ab176b0 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -176,8 +176,6 @@ .. _Nicolas Hug: https://github.com/NicolasHug -<<<<<<< HEAD .. _Ashutosh Hathidara: https://github.com/ashutosh1919 -======= -.. _Guillaume Lemaitre: https://github.com/glemaitre ->>>>>>> upstream/master + +.. 
_Guillaume Lemaitre: https://github.com/glemaitre \ No newline at end of file From 98bed82a5a0c6157582d603938ccc60e9fdf7cc6 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Mon, 6 Jan 2020 18:25:27 +0530 Subject: [PATCH 024/103] Render doc error --- doc/modules/model_evaluation.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index ec8bfe4252751..5ab44b7eb6a09 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -28,6 +28,8 @@ predictions: Finally, :ref:`dummy_estimators` are useful to get a baseline value of those metrics for random predictions. +.. |neg_mape| replace:: 'neg_mean_absolute_percentage_error' + .. seealso:: For "pairwise" metrics, between *samples* and not estimators or @@ -91,8 +93,7 @@ Scoring Function 'explained_variance' :func:`metrics.explained_variance_score` 'max_error' :func:`metrics.max_error` 'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -'neg_mean_absolute_percentage_error' - :func:`metrics.mean_absolute_percentage_error` +|neg_mape| :func:`metrics.mean_absolute_percentage_error` 'neg_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` @@ -1978,6 +1979,7 @@ error (MAPE) estimated over :math:`n_{\text{samples}}` is defined as .. math:: \text{MAPE}(y, \hat{y}) = \frac{\mid{y} - \hat{y}\mid}{\text{MAX}(\epsilon, \mid{y}\mid)} + Where :math:`\epsilon` is a very small number greater than zero and kept in order to avoid the crash when y is zero. 
From 9e0a347ffa98bc170585b2920d860dd7f26339c1 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Mon, 6 Jan 2020 18:49:19 +0530 Subject: [PATCH 025/103] Resolving render error --- doc/modules/model_evaluation.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 5ab44b7eb6a09..5d3a45a848bab 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -28,8 +28,6 @@ predictions: Finally, :ref:`dummy_estimators` are useful to get a baseline value of those metrics for random predictions. -.. |neg_mape| replace:: 'neg_mean_absolute_percentage_error' - .. seealso:: For "pairwise" metrics, between *samples* and not estimators or @@ -93,7 +91,7 @@ Scoring Function 'explained_variance' :func:`metrics.explained_variance_score` 'max_error' :func:`metrics.max_error` 'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -|neg_mape| :func:`metrics.mean_absolute_percentage_error` +'neg_mape' :func:`metrics.mean_absolute_percentage_error` 'neg_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` From 545bacfca6180f4aa9897590257b9dd981534d79 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Mon, 6 Jan 2020 19:13:50 +0530 Subject: [PATCH 026/103] Resolving render doc --- doc/modules/model_evaluation.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 5d3a45a848bab..8d331ec7debfa 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -91,7 +91,8 @@ Scoring Function 'explained_variance' :func:`metrics.explained_variance_score` 'max_error' :func:`metrics.max_error` 'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -'neg_mape' :func:`metrics.mean_absolute_percentage_error` 
+'neg_mean_absolute_percentage_ +error' :func:`metrics.mean_absolute_percentage_error` 'neg_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` From ba0b63b55a615110306c3130517e6278b5778b3c Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Mon, 6 Jan 2020 19:27:36 +0530 Subject: [PATCH 027/103] Resolving render doc --- doc/modules/model_evaluation.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 8d331ec7debfa..997964fa49966 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -91,8 +91,7 @@ Scoring Function 'explained_variance' :func:`metrics.explained_variance_score` 'max_error' :func:`metrics.max_error` 'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -'neg_mean_absolute_percentage_ -error' :func:`metrics.mean_absolute_percentage_error` +'neg_mean_absolute_percentage_error' :func:`metrics.mean_absolute_percentage_error` 'neg_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` From b731255742d1f6f5a2299a849f99a7eb5ffa366d Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 7 Jan 2020 09:55:30 +0530 Subject: [PATCH 028/103] Resolving render doc --- doc/modules/model_evaluation.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 997964fa49966..1d78dfbf96aef 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -91,7 +91,6 @@ Scoring Function 'explained_variance' :func:`metrics.explained_variance_score` 'max_error' :func:`metrics.max_error` 'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -'neg_mean_absolute_percentage_error' 
:func:`metrics.mean_absolute_percentage_error` 'neg_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` From 197f576dc7ba86c672b889ff38cff5647b9bce33 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Tue, 7 Jan 2020 14:41:49 +0530 Subject: [PATCH 029/103] Update doc/modules/model_evaluation.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 1d78dfbf96aef..fbdb8e985ac29 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1980,7 +1980,7 @@ error (MAPE) estimated over :math:`n_{\text{samples}}` is defined as Where :math:`\epsilon` is a very small number greater than zero and kept in order to avoid the crash when y is zero. -The :func:`mean_absolute_percentage_error` support multioutput. +The :func:`mean_absolute_percentage_error` function supports multioutput. Here is a small example of usage of the :func:`mean_absolute_percentage_error` function:: From 5869ddaefa7516d229a44dc7ee061a3bb55d6f10 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Tue, 7 Jan 2020 14:42:55 +0530 Subject: [PATCH 030/103] Update sklearn/metrics/_regression.py Co-Authored-By: Alexandre Gramfort --- sklearn/metrics/_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 0ddaca0fda68d..d8804ffc8dc24 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -199,7 +199,7 @@ def mean_absolute_percentage_error(y_true, y_pred, Parameters ---------- - y_true : array-like of shape = (n_samples) or (n_samples, n_outputs) + y_true : array-like of shape = (n_samples,) or (n_samples, n_outputs) Ground truth (correct) target values. 
y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs) From 1adc6355f1c8641904bf646fa1674494e428661d Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Tue, 7 Jan 2020 14:43:25 +0530 Subject: [PATCH 031/103] Update sklearn/metrics/_regression.py Co-Authored-By: Alexandre Gramfort --- sklearn/metrics/_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index d8804ffc8dc24..14c7e0e20e9e6 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -202,7 +202,7 @@ def mean_absolute_percentage_error(y_true, y_pred, y_true : array-like of shape = (n_samples,) or (n_samples, n_outputs) Ground truth (correct) target values. - y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs) + y_pred : array-like of shape = (n_samples,) or (n_samples, n_outputs) Estimated target values. sample_weight : array-like of shape = (n_samples), optional From 5f25b68c85911806d9b4e88b3693b3606787ce62 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 7 Jan 2020 15:10:53 +0530 Subject: [PATCH 032/103] Applying suggested changes --- doc/whats_new/v0.16.rst | 2 +- sklearn/metrics/_regression.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.16.rst b/doc/whats_new/v0.16.rst index 4eb4a10d82e92..56992c634c8f9 100644 --- a/doc/whats_new/v0.16.rst +++ b/doc/whats_new/v0.16.rst @@ -118,7 +118,7 @@ New features - Added :func:`metrics.median_absolute_error`, a robust metric. By `Gael Varoquaux`_ and :user:`Florian Wilhelm `. -- Added :func:`metrics.mean_absolute_percentage_error`, a robust metric. +- Added :func:`metrics.mean_absolute_percentage_error`. By :user:`Ashutosh Hathidara `. - Add :class:`cluster.Birch`, an online clustering algorithm. 
By diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 14c7e0e20e9e6..74ffe625bc5fe 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -205,11 +205,11 @@ def mean_absolute_percentage_error(y_true, y_pred, y_pred : array-like of shape = (n_samples,) or (n_samples, n_outputs) Estimated target values. - sample_weight : array-like of shape = (n_samples), optional + sample_weight : array-like of shape = (n_samples,), optional Sample weights. multioutput : string in ['raw_values', 'uniform_average'] - or array-like of shape (n_outputs) + or array-like of shape (n_outputs,) Defines aggregating of multiple output values. Array-like value defines weights used to average errors. From c28a2f5bde7cd1d860f1158d9a20a9d203a0f4aa Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 7 Jan 2020 15:58:51 +0530 Subject: [PATCH 033/103] Applying suggested changes --- sklearn/metrics/_regression.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 74ffe625bc5fe..5a201f7bf88dc 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -208,8 +208,7 @@ def mean_absolute_percentage_error(y_true, y_pred, sample_weight : array-like of shape = (n_samples,), optional Sample weights. - multioutput : string in ['raw_values', 'uniform_average'] - or array-like of shape (n_outputs,) + multioutput : string in ['raw_values', 'uniform_average'] or list of shape (n_outputs,) Defines aggregating of multiple output values. Array-like value defines weights used to average errors. 
From 5444ac68aea4ba37a5aeb11bd6f77b67d714a943 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 7 Jan 2020 18:10:22 +0530 Subject: [PATCH 034/103] Applying suggested changes --- sklearn/metrics/_regression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 5a201f7bf88dc..a3da50f276b61 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -208,7 +208,8 @@ def mean_absolute_percentage_error(y_true, y_pred, sample_weight : array-like of shape = (n_samples,), optional Sample weights. - multioutput : string in ['raw_values', 'uniform_average'] or list of shape (n_outputs,) + multioutput : string in ['raw_values', 'uniform_average'] + or list of shape (n_outputs,) Defines aggregating of multiple output values. Array-like value defines weights used to average errors. From 5dbe07df23cc76ba9dc621f13ab8b139b581f494 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 7 Jan 2020 18:17:06 +0530 Subject: [PATCH 035/103] Applying suggested changes --- sklearn/metrics/_regression.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index a3da50f276b61..5a201f7bf88dc 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -208,8 +208,7 @@ def mean_absolute_percentage_error(y_true, y_pred, sample_weight : array-like of shape = (n_samples,), optional Sample weights. - multioutput : string in ['raw_values', 'uniform_average'] - or list of shape (n_outputs,) + multioutput : string in ['raw_values', 'uniform_average'] or list of shape (n_outputs,) Defines aggregating of multiple output values. Array-like value defines weights used to average errors. 
From 90f7533eb0d6ff7a04aa3090536d143fe774014c Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 7 Jan 2020 18:19:57 +0530 Subject: [PATCH 036/103] Applying suggested changes --- sklearn/metrics/_regression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 5a201f7bf88dc..cef09e4ed6722 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -208,7 +208,8 @@ def mean_absolute_percentage_error(y_true, y_pred, sample_weight : array-like of shape = (n_samples,), optional Sample weights. - multioutput : string in ['raw_values', 'uniform_average'] or list of shape (n_outputs,) + multioutput : string in ['raw_values', 'uniform_average'] or + : list of shape (n_outputs,) Defines aggregating of multiple output values. Array-like value defines weights used to average errors. From 58970979940b1707e71b9bcf9c5aeef5d3a8c4c7 Mon Sep 17 00:00:00 2001 From: ashutosh1919 Date: Tue, 7 Jan 2020 18:33:17 +0530 Subject: [PATCH 037/103] Applying suggested changes --- sklearn/metrics/_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index cef09e4ed6722..dd1d242a19c64 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -208,10 +208,10 @@ def mean_absolute_percentage_error(y_true, y_pred, sample_weight : array-like of shape = (n_samples,), optional Sample weights. - multioutput : string in ['raw_values', 'uniform_average'] or - : list of shape (n_outputs,) + multioutput : string in ['raw_values', 'uniform_average'] or list. Defines aggregating of multiple output values. Array-like value defines weights used to average errors. + If input is list then the shape must be (n_outputs,). 'raw_values' : Returns a full set of errors in case of multioutput input. 
From 3962b9a396cc4943cb06887dc3421166006e3768 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 7 Feb 2020 22:15:52 +0530 Subject: [PATCH 038/103] Update doc/modules/model_evaluation.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index fbdb8e985ac29..2c095b867ce42 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1963,7 +1963,7 @@ function:: Mean absolute percentage error ------------------------------ -The :func: The `mean_absolute_percentage_error` (MAPE), also known as mean absolute +The :func:`mean_absolute_percentage_error` (MAPE), also known as mean absolute percentage deviation (MAPD), is a measure of prediction accuracy of a regression method in statistics, for example in price prediction, also used as a loss function for regression problems in machine learning. It is blind to global scaling of the From aa8f6ec8ad47e0ad97a5c2b3fd42bd737c5a5003 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 7 Feb 2020 22:16:13 +0530 Subject: [PATCH 039/103] Update doc/modules/model_evaluation.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 2c095b867ce42..82330e36884e4 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1964,7 +1964,7 @@ function:: Mean absolute percentage error ------------------------------ The :func:`mean_absolute_percentage_error` (MAPE), also known as mean absolute -percentage deviation (MAPD), is a measure of prediction accuracy of a regression +percentage deviation (MAPD), is an evaluation metric for regression problems. 
method in statistics, for example in price prediction, also used as a loss function for regression problems in machine learning. It is blind to global scaling of the target as long as the prediction is scaled like the true y. From fdc197fd807e62e090cef5089cf70ba4125f5443 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 7 Feb 2020 22:16:34 +0530 Subject: [PATCH 040/103] Update doc/modules/model_evaluation.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/model_evaluation.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 82330e36884e4..6bcb69b9d3cca 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1965,7 +1965,6 @@ Mean absolute percentage error ------------------------------ The :func:`mean_absolute_percentage_error` (MAPE), also known as mean absolute percentage deviation (MAPD), is an evaluation metric for regression problems. -method in statistics, for example in price prediction, also used as a loss function for regression problems in machine learning. It is blind to global scaling of the target as long as the prediction is scaled like the true y. From 94b6b5bd9a94ad1f813d732659ed6bc392141d95 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 7 Feb 2020 22:16:53 +0530 Subject: [PATCH 041/103] Update doc/modules/model_evaluation.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 6bcb69b9d3cca..4ed994295141f 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1965,7 +1965,7 @@ Mean absolute percentage error ------------------------------ The :func:`mean_absolute_percentage_error` (MAPE), also known as mean absolute percentage deviation (MAPD), is an evaluation metric for regression problems. 
-for regression problems in machine learning. It is blind to global scaling of the +The idea of this metric is to be sensitive to relative errors. It is for example not changed by a global scaling of the target variable. target as long as the prediction is scaled like the true y. If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample From 595a0c271499e0ba564e36394860b03cc825fabe Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 7 Feb 2020 22:17:10 +0530 Subject: [PATCH 042/103] Update doc/modules/model_evaluation.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/model_evaluation.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 4ed994295141f..08b30163f216e 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1966,7 +1966,6 @@ Mean absolute percentage error The :func:`mean_absolute_percentage_error` (MAPE), also known as mean absolute percentage deviation (MAPD), is an evaluation metric for regression problems. The idea of this metric is to be sensitive to relative errors. It is for example not changed by a global scaling of the target variable. -target as long as the prediction is scaled like the true y. 
If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample and :math:`y_i` is the corresponding true value, then the mean absolute percentage From a64e5ff4aae34094b4b625bfb42f6dc07c48432b Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 7 Feb 2020 22:17:42 +0530 Subject: [PATCH 043/103] Update doc/modules/model_evaluation.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 08b30163f216e..3a36ce925d628 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1975,7 +1975,7 @@ error (MAPE) estimated over :math:`n_{\text{samples}}` is defined as \text{MAPE}(y, \hat{y}) = \frac{\mid{y} - \hat{y}\mid}{\text{MAX}(\epsilon, \mid{y}\mid)} -Where :math:`\epsilon` is a very small number greater than zero and kept in order to avoid +where :math:`\epsilon` is a very small number greater than zero to avoid the crash when y is zero. The :func:`mean_absolute_percentage_error` function supports multioutput. From 10979113a2e3f2470806a8b4a1d40b6c7a41c3b9 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 7 Feb 2020 22:18:07 +0530 Subject: [PATCH 044/103] Update doc/modules/model_evaluation.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 3a36ce925d628..f7c0ebf98be15 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1973,7 +1973,7 @@ error (MAPE) estimated over :math:`n_{\text{samples}}` is defined as .. 
math:: - \text{MAPE}(y, \hat{y}) = \frac{\mid{y} - \hat{y}\mid}{\text{MAX}(\epsilon, \mid{y}\mid)} + \text{MAPE}(y, \hat{y}) = \frac{\mid y - \hat{y} \mid}{\text{MAX}(\epsilon, \mid y \mid)} where :math:`\epsilon` is a very small number greater than zero to avoid the crash when y is zero. From 9782f34e59b0b956cf6dd99b34aea28c79379990 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 7 Feb 2020 22:22:24 +0530 Subject: [PATCH 045/103] Doc Too long line error resolved --- doc/modules/model_evaluation.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index f7c0ebf98be15..c421a3e6cf1e7 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1965,7 +1965,8 @@ Mean absolute percentage error ------------------------------ The :func:`mean_absolute_percentage_error` (MAPE), also known as mean absolute percentage deviation (MAPD), is an evaluation metric for regression problems. -The idea of this metric is to be sensitive to relative errors. It is for example not changed by a global scaling of the target variable. +The idea of this metric is to be sensitive to relative errors. It is for example +not changed by a global scaling of the target variable. 
If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample and :math:`y_i` is the corresponding true value, then the mean absolute percentage From dbc2c4a3a62b47313a6faf623ffd0d424c2cd07f Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 16:12:05 +0530 Subject: [PATCH 046/103] datatype changed and made compatible to y_true --- sklearn/metrics/_regression.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 0b0d236703615..4ca696e1ea954 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -219,7 +219,6 @@ def mean_absolute_percentage_error(y_true, y_pred, 'uniform_average' : Errors of all outputs are averaged with uniform weight. - Returns ------- loss : float or ndarray of floats @@ -247,7 +246,7 @@ def mean_absolute_percentage_error(y_true, y_pred, y_type, y_true, y_pred, multioutput = _check_reg_targets( y_true, y_pred, multioutput) check_consistent_length(y_true, y_pred, sample_weight) - epsilon = np.finfo(float).eps + epsilon = np.finfo(y_true.dtype).eps mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon) output_errors = np.average(mape, weights=sample_weight, axis=0) * 100.0 From e4cd050626d60538aa990208542df07745bf9316 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 16:33:27 +0530 Subject: [PATCH 047/103] Added scorer --- sklearn/metrics/_regression.py | 1 + sklearn/metrics/_scorer.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 4ca696e1ea954..9e348bd675842 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -20,6 +20,7 @@ # Michael Eickenberg # Konstantin Shmelkov # Christian Lorentzen +# Ashutosh Hathidara # License: BSD 3 clause diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 3df175c2ca306..f14c29cdc0bb3 100644 
--- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -31,7 +31,7 @@ f1_score, roc_auc_score, average_precision_score, precision_score, recall_score, log_loss, balanced_accuracy_score, explained_variance_score, - brier_score_loss, jaccard_score) + brier_score_loss, jaccard_score, mean_absolute_percentage_error) from .cluster import adjusted_rand_score from .cluster import homogeneity_score @@ -623,6 +623,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, greater_is_better=False) neg_mean_absolute_error_scorer = make_scorer(mean_absolute_error, greater_is_better=False) +neg_mape_scorer = make_scorer(mean_absolute_percentage_error, + greater_is_better=False) neg_median_absolute_error_scorer = make_scorer(median_absolute_error, greater_is_better=False) neg_root_mean_squared_error_scorer = make_scorer(mean_squared_error, @@ -687,6 +689,7 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, max_error=max_error_scorer, neg_median_absolute_error=neg_median_absolute_error_scorer, neg_mean_absolute_error=neg_mean_absolute_error_scorer, + neg_mape=neg_mape_scorer, neg_mean_squared_error=neg_mean_squared_error_scorer, neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, neg_root_mean_squared_error=neg_root_mean_squared_error_scorer, From c7a8b5f708a69c623db8a3121113111f9df3c781 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 16:35:23 +0530 Subject: [PATCH 048/103] eps datatype changed to np.float64 --- sklearn/metrics/_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 9e348bd675842..aa8c100718b74 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -247,7 +247,7 @@ def mean_absolute_percentage_error(y_true, y_pred, y_type, y_true, y_pred, multioutput = _check_reg_targets( y_true, y_pred, multioutput) check_consistent_length(y_true, y_pred, sample_weight) 
- epsilon = np.finfo(y_true.dtype).eps + epsilon = np.finfo(np.float64).eps mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon) output_errors = np.average(mape, weights=sample_weight, axis=0) * 100.0 From d4060811a4ba1b799b0a802b2c0500ebad2dd579 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 16:52:33 +0530 Subject: [PATCH 049/103] test_regression.py is changed to more meaningful test cases --- sklearn/metrics/tests/test_regression.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index ee69d45d8e05a..dbcf2404c0c31 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -33,10 +33,9 @@ def test_regression_metrics(n_samples=50): np.log(1 + y_pred))) assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.) assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) - assert_almost_equal(np.around(mean_absolute_percentage_error(y_true, - y_pred), - decimals=2), - 9007199254741000.0, decimal=-13) + mape = mean_absolute_percentage_error(y_true, y_pred) + assert np.isfinite(mape) + assert mape>1e6 assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) @@ -93,7 +92,8 @@ def test_multioutput_regression(): error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) - assert_almost_equal(error, 375e+14, decimal=-14) + assert np.isfinite(error) + assert error>1e6 error = median_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 1.) / 4.) 
From 5db5f5eea5ffca8010f195d2f90226ddbed69146 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 16:59:02 +0530 Subject: [PATCH 050/103] test_regression.py is changed to more meaningful test cases --- sklearn/metrics/tests/test_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index dbcf2404c0c31..573f66c36511c 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -35,7 +35,7 @@ def test_regression_metrics(n_samples=50): assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) mape = mean_absolute_percentage_error(y_true, y_pred) assert np.isfinite(mape) - assert mape>1e6 + assert mape > 1e6 assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) @@ -93,7 +93,7 @@ def test_multioutput_regression(): error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) assert np.isfinite(error) - assert error>1e6 + assert error > 1e6 error = median_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 1.) / 4.) 
From f4cfc22cf8017c782d6d7ca5e0e80ea4014db55a Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 17:13:10 +0530 Subject: [PATCH 051/103] Resolving errors related to scorer tests in test_common.py and _scorer.py --- sklearn/metrics/_scorer.py | 2 +- sklearn/metrics/tests/test_common.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index f14c29cdc0bb3..9ec4ccf2e848f 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -689,7 +689,7 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, max_error=max_error_scorer, neg_median_absolute_error=neg_median_absolute_error_scorer, neg_mean_absolute_error=neg_mean_absolute_error_scorer, - neg_mape=neg_mape_scorer, + neg_mean_absolute_percentage_error=neg_mape_scorer, neg_mean_squared_error=neg_mean_squared_error_scorer, neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, neg_root_mean_squared_error=neg_root_mean_squared_error_scorer, diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 7301d21a35f39..e984536e7afe2 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -41,6 +41,7 @@ from sklearn.metrics import max_error from sklearn.metrics import matthews_corrcoef from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_absolute_percentage_error from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_tweedie_deviance from sklearn.metrics import mean_poisson_deviance @@ -98,6 +99,7 @@ "mean_absolute_error": mean_absolute_error, "mean_squared_error": mean_squared_error, "median_absolute_error": median_absolute_error, + "mean_absolute_percentage_error": mean_absolute_percentage_error, "explained_variance_score": explained_variance_score, "r2_score": partial(r2_score, multioutput='variance_weighted'), "mean_normal_deviance": 
partial(mean_tweedie_deviance, power=0), @@ -472,7 +474,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "macro_f0.5_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", "log_loss", "hinge_loss", "mean_gamma_deviance", "mean_poisson_deviance", - "mean_compound_poisson_deviance" + "mean_compound_poisson_deviance", "mean_absolute_percentage_error" } From b681dcbda10baec02fd88f544138aa0fe5a40ec0 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 17:26:32 +0530 Subject: [PATCH 052/103] Updated v0.23.rst in whats_new --- doc/whats_new/v0.16.rst | 3 --- doc/whats_new/v0.23.rst | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.16.rst b/doc/whats_new/v0.16.rst index 56992c634c8f9..c5bbaa4a7e21a 100644 --- a/doc/whats_new/v0.16.rst +++ b/doc/whats_new/v0.16.rst @@ -117,9 +117,6 @@ New features - Added :func:`metrics.median_absolute_error`, a robust metric. By `Gael Varoquaux`_ and :user:`Florian Wilhelm `. - -- Added :func:`metrics.mean_absolute_percentage_error`. - By :user:`Ashutosh Hathidara `. - Add :class:`cluster.Birch`, an online clustering algorithm. By `Manoj Kumar`_, `Alexandre Gramfort`_ and `Joel Nothman`_. diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index e727f763583e6..ac9874a2b82f9 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -217,6 +217,12 @@ Changelog - |Fix| Fixed a bug in :func:`metrics.mutual_info_score` where negative scores could be returned. :pr:`16362` by `Thomas Fan`_. +- |Feature| Added :func:`metrics.mean_absolute_percentage_error` metric and + the associated scorer for regression problems. :issue:`10708` fixed with the + PR :pr:`15007` by :user:`Ashutosh Hathidara `. The scorer and + some practical test cases were taken from PR :pr:`10711` by + :user:`Mohamed Ali Jamaoui `. + :mod:`sklearn.model_selection` .............................. 
From d4fcc39679cf5a64ff96a99a3c03cccdef82a63e Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 18:23:04 +0530 Subject: [PATCH 053/103] resolving errors of mape scorer --- sklearn/metrics/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index e984536e7afe2..eb479a3b0c13c 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -427,7 +427,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): # Regression metrics with "multioutput-continuous" format support MULTIOUTPUT_METRICS = { "mean_absolute_error", "median_absolute_error", "mean_squared_error", - "r2_score", "explained_variance_score" + "r2_score", "explained_variance_score", "mean_absolute_percentage_error" } # Symmetric with respect to their input arguments y_true and y_pred From 9261cee238e3d8c579b18a2da5b026897f739a61 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 18:57:32 +0530 Subject: [PATCH 054/103] resolving errors of mape scorer --- sklearn/metrics/tests/test_common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index eb479a3b0c13c..705da12808213 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1373,7 +1373,11 @@ def test_thresholded_multilabel_multioutput_permutations_invariance(name): y_true_perm = y_true[:, perm] current_score = metric(y_true_perm, y_score_perm) - assert_almost_equal(score, current_score) + if name=="mean_absolute_percentage_error": + assert np.isfinite(current_score) + assert current_score > 1e6 + else: + assert_almost_equal(score, current_score) @pytest.mark.parametrize( From a161f79ec7f890e06ce75655599af13495cba5b4 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 19:06:09 +0530 Subject: [PATCH 
055/103] resolving errors of mape scorer --- sklearn/metrics/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 705da12808213..8aba31c2746a4 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1373,7 +1373,7 @@ def test_thresholded_multilabel_multioutput_permutations_invariance(name): y_true_perm = y_true[:, perm] current_score = metric(y_true_perm, y_score_perm) - if name=="mean_absolute_percentage_error": + if metric==mean_absolute_percentage_error: assert np.isfinite(current_score) assert current_score > 1e6 else: From d102f86cc63db1d029fd08ef1cab0686049005f8 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 19:07:44 +0530 Subject: [PATCH 056/103] resolving errors of mape scorer --- sklearn/metrics/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 8aba31c2746a4..9eb0c42227ad1 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1373,7 +1373,7 @@ def test_thresholded_multilabel_multioutput_permutations_invariance(name): y_true_perm = y_true[:, perm] current_score = metric(y_true_perm, y_score_perm) - if metric==mean_absolute_percentage_error: + if metric == mean_absolute_percentage_error: assert np.isfinite(current_score) assert current_score > 1e6 else: From 1970a3014b8b81be55925762fd75956c95e55372 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 20 Feb 2020 19:38:28 +0530 Subject: [PATCH 057/103] modified test case in model_evaluation.rst --- doc/modules/model_evaluation.rst | 6 +++--- doc/whats_new/v0.16.rst | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index c529a0b73bf57..36e00daf88d12 100644 --- 
a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1984,10 +1984,10 @@ Here is a small example of usage of the :func:`mean_absolute_percentage_error` function:: >>> from sklearn.metrics import mean_absolute_percentage_error - >>> y_true = [3, -0.5, 2, 7] - >>> y_pred = [2.5, 0.0, 2, 8] + >>> y_true = [1, 10, 1e6] + >>> y_pred = [0.9, 15, 1.2e6] >>> mean_absolute_percentage_error(y_true, y_pred) - 32.73... + 26.66... .. _median_absolute_error: diff --git a/doc/whats_new/v0.16.rst b/doc/whats_new/v0.16.rst index c5bbaa4a7e21a..931c7e0fbb923 100644 --- a/doc/whats_new/v0.16.rst +++ b/doc/whats_new/v0.16.rst @@ -117,7 +117,7 @@ New features - Added :func:`metrics.median_absolute_error`, a robust metric. By `Gael Varoquaux`_ and :user:`Florian Wilhelm `. - + - Add :class:`cluster.Birch`, an online clustering algorithm. By `Manoj Kumar`_, `Alexandre Gramfort`_ and `Joel Nothman`_. From 2b4128d64fb73fc4c03f0131a9d75c06d7e8505a Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 21 Feb 2020 11:19:12 +0530 Subject: [PATCH 058/103] modified doc and code as per second batch comments --- azure-pipelines.yml | 6 + build_tools/azure/install.cmd | 4 +- build_tools/azure/install.sh | 6 +- build_tools/azure/posix-32.yml | 1 + doc/developers/develop.rst | 5 + doc/developers/tips.rst | 2 + doc/modules/clustering.rst | 13 +- doc/modules/model_evaluation.rst | 9 +- doc/whats_new/v0.23.rst | 41 +- setup.cfg | 2 +- setup.py | 4 +- sklearn/base.py | 1 + sklearn/calibration.py | 7 +- sklearn/cluster/_bicluster.py | 19 +- sklearn/cluster/_k_means_elkan.pyx | 770 +++++++++++++++++------ sklearn/cluster/_k_means_fast.pxd | 23 + sklearn/cluster/_k_means_fast.pyx | 507 ++++++++------- sklearn/cluster/_k_means_lloyd.pyx | 407 ++++++++++++ sklearn/cluster/_kmeans.py | 738 ++++++++++++---------- sklearn/cluster/setup.py | 16 +- sklearn/cluster/tests/test_bicluster.py | 16 +- sklearn/cluster/tests/test_k_means.py | 317 ++++++++-- 
sklearn/compose/_column_transformer.py | 62 +- sklearn/compose/_target.py | 15 +- sklearn/decomposition/_sparse_pca.py | 8 + sklearn/discriminant_analysis.py | 10 +- sklearn/dummy.py | 18 +- sklearn/ensemble/_bagging.py | 5 +- sklearn/ensemble/tests/test_bagging.py | 22 + sklearn/isotonic.py | 10 +- sklearn/kernel_approximation.py | 18 +- sklearn/metrics/_regression.py | 2 + sklearn/metrics/_scorer.py | 63 +- sklearn/metrics/tests/test_common.py | 4 + sklearn/metrics/tests/test_regression.py | 7 + sklearn/neural_network/_rbm.py | 8 + sklearn/svm/_classes.py | 9 + sklearn/tests/test_common.py | 11 +- sklearn/tree/_classes.py | 2 +- sklearn/utils/estimator_checks.py | 14 - sklearn/utils/sparsefuncs_fast.pyx | 12 +- sklearn/utils/tests/test_sparsefuncs.py | 16 +- sklearn/utils/validation.py | 2 +- 43 files changed, 2240 insertions(+), 992 deletions(-) create mode 100644 sklearn/cluster/_k_means_fast.pxd create mode 100644 sklearn/cluster/_k_means_lloyd.pyx diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 99431332be662..025fc86ed206e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -39,6 +39,7 @@ jobs: PILLOW_VERSION: '*' PYTEST_VERSION: '*' JOBLIB_VERSION: '*' + THREADPOOLCTL_VERSION: '2.0.0' COVERAGE: 'true' - template: build_tools/azure/posix.yml @@ -54,6 +55,7 @@ jobs: DISTRIB: 'ubuntu' PYTHON_VERSION: '3.6' JOBLIB_VERSION: '0.11' + THREADPOOLCTL_VERSION: '2.0.0' # Linux + Python 3.6 build with OpenBLAS and without SITE_JOBLIB py36_conda_openblas: DISTRIB: 'conda' @@ -70,6 +72,7 @@ jobs: SCIKIT_IMAGE_VERSION: '*' # latest version of joblib available in conda for Python 3.6 JOBLIB_VERSION: '0.13.2' + THREADPOOLCTL_VERSION: '2.0.0' COVERAGE: 'true' # Linux environment to test the latest available dependencies and MKL. # It runs tests requiring lightgbm, pandas and PyAMG. 
@@ -92,6 +95,7 @@ jobs: DISTRIB: 'ubuntu-32' PYTHON_VERSION: '3.6' JOBLIB_VERSION: '0.13' + THREADPOOLCTL_VERSION: '2.0.0' - template: build_tools/azure/posix.yml parameters: @@ -109,6 +113,7 @@ jobs: PILLOW_VERSION: '*' PYTEST_VERSION: '*' JOBLIB_VERSION: '*' + THREADPOOLCTL_VERSION: '2.0.0' COVERAGE: 'true' pylatest_conda_mkl_no_openmp: DISTRIB: 'conda' @@ -120,6 +125,7 @@ jobs: PILLOW_VERSION: '*' PYTEST_VERSION: '*' JOBLIB_VERSION: '*' + THREADPOOLCTL_VERSION: '2.0.0' COVERAGE: 'true' SKLEARN_TEST_NO_OPENMP: 'true' SKLEARN_SKIP_OPENMP_TEST: 'true' diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index 2566ba4f4f3aa..aa32e7cf2612d 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -15,6 +15,8 @@ IF "%PYTHON_ARCH%"=="64" ( call activate %VIRTUALENV% + pip install threadpoolctl + IF "%PYTEST_VERSION%"=="*" ( pip install pytest ) else ( @@ -22,7 +24,7 @@ IF "%PYTHON_ARCH%"=="64" ( ) pip install pytest-xdist ) else ( - pip install numpy scipy cython pytest wheel pillow joblib + pip install numpy scipy cython pytest wheel pillow joblib threadpoolctl ) if "%COVERAGE%" == "true" ( pip install coverage codecov pytest-cov diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 1ec0b06497592..9c83acfa02efd 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -65,6 +65,8 @@ if [[ "$DISTRIB" == "conda" ]]; then make_conda $TO_INSTALL + pip install threadpoolctl==$THREADPOOLCTL_VERSION + if [[ "$PYTEST_VERSION" == "*" ]]; then python -m pip install pytest else @@ -81,13 +83,13 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then sudo apt-get install python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate - python -m pip install pytest==$PYTEST_VERSION pytest-cov cython joblib==$JOBLIB_VERSION + python -m pip install pytest==$PYTEST_VERSION 
pytest-cov cython joblib==$JOBLIB_VERSION threadpoolctl==$THREADPOOLCTL_VERSION elif [[ "$DISTRIB" == "ubuntu-32" ]]; then apt-get update apt-get install -y python3-dev python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate - python -m pip install pytest==$PYTEST_VERSION pytest-cov cython joblib==$JOBLIB_VERSION + python -m pip install pytest==$PYTEST_VERSION pytest-cov cython joblib==$JOBLIB_VERSION threadpoolctl==$THREADPOOLCTL_VERSION elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then # Since conda main channel usually lacks behind on the latest releases, # we use pypi to test against the latest releases of the dependencies. diff --git a/build_tools/azure/posix-32.yml b/build_tools/azure/posix-32.yml index b4c9e4ebb287e..febc8acb3a1d3 100644 --- a/build_tools/azure/posix-32.yml +++ b/build_tools/azure/posix-32.yml @@ -36,6 +36,7 @@ jobs: -e JUNITXML=$JUNITXML -e VIRTUALENV=testvenv -e JOBLIB_VERSION=$JOBLIB_VERSION + -e THREADPOOLCTL_VERSION=$THREADPOOLCTL_VERSION -e PYTEST_VERSION=$PYTEST_VERSION -e OMP_NUM_THREADS=$OMP_NUM_THREADS -e OPENBLAS_NUM_THREADS=$OPENBLAS_NUM_THREADS diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 5e11f46eccdb8..a9acfe2c0c0a7 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -527,6 +527,11 @@ _skip_test (default=``False``) whether to skip common tests entirely. Don't use this unless you have a *very good* reason. +_xfail_test (default=``False``) + dictionary ``{check_name : reason}`` of common checks to mark as a + known failure, with the associated reason. Don't use this unless you have a + *very good* reason. + stateless (default=``False``) whether the estimator needs access to data for fitting. 
Even though an estimator is stateless, it might still need a call to ``fit`` for diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst index 177f82983dfc1..b26d68ecfbe02 100644 --- a/doc/developers/tips.rst +++ b/doc/developers/tips.rst @@ -86,6 +86,8 @@ Other `pytest` options that may become useful include: - ``-s`` so that pytest does not capture the output of ``print()`` statements - ``--tb=short`` or ``--tb=line`` to control the length of the logs + - ``--runxfail`` also run tests marked as a known failure (XFAIL) and report + errors. Since our continuous integration tests will error if ``FutureWarning`` isn't properly caught, diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 5649c3f5237da..8196d9834ea51 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -205,12 +205,13 @@ computing cluster centers and values of inertia. For example, assigning a weight of 2 to a sample is equivalent to adding a duplicate of that sample to the dataset :math:`X`. -A parameter can be given to allow K-means to be run in parallel, called -``n_jobs``. Giving this parameter a positive value uses that many processors -(default: 1). A value of -1 uses all available processors, with -2 using one -less, and so on. Parallelization generally speeds up computation at the cost of -memory (in this case, multiple copies of centroids need to be stored, one for -each job). +Low-level parallelism +--------------------- + +:class:`KMeans` benefits from OpenMP based parallelism through Cython. Small +chunks of data (256 samples) are processed in parallel, which in addition +yields a low memory footprint. For more details on how to control the number of +threads, please refer to our :ref:`parallelism` notes. .. 
warning:: diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 36e00daf88d12..8864151afd9eb 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1975,8 +1975,8 @@ error (MAPE) estimated over :math:`n_{\text{samples}}` is defined as \text{MAPE}(y, \hat{y}) = \frac{\mid y - \hat{y} \mid}{\text{MAX}(\epsilon, \mid y \mid)} -where :math:`\epsilon` is a very small number greater than zero to avoid -the crash when y is zero. +where :math:`\epsilon` is an arbitrary small yet strictly positive number to +avoid undefined results when y is zero. The :func:`mean_absolute_percentage_error` function supports multioutput. @@ -1989,6 +1989,11 @@ function:: >>> mean_absolute_percentage_error(y_true, y_pred) 26.66... +In above example, if we had used `mean_absolute_error`, it would have ignored +the small magnitude values and only reflected the error in prediction of highest +magnitude value. But that problem is resolved in case of MAPE because it calculates +relative percentage error with respect to actual output. + .. _median_absolute_error: Median absolute error diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index ac9874a2b82f9..258c24e74af1d 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -23,7 +23,8 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- list models here +- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, + and :class:`ensemble.IsolationForest`. |Fix| Details are listed in the changelog below. @@ -64,6 +65,26 @@ Changelog could not have a `np.int64` type. :pr:`16484` by :user:`Jeremie du Boisberranger `. +- |API| The ``n_jobs`` parameter of :class:`cluster.KMeans`, + :class:`cluster.SpectralCoclustering` and + :class:`cluster.SpectralBiclustering` is deprecated. 
They now use OpenMP + based parallelism. For more details on how to control the number of threads, + please refer to our :ref:`parallelism` notes. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + +- |API| The ``precompute_distances`` parameter of :class:`cluster.KMeans` is + deprecated. It has no effect. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + +- |Efficiency| The critical parts of :class:`cluster.KMeans` have a more + optimized implementation. Parallelism is now over the data instead of over + initializations allowing better scalability. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + +- |Enhancement| :class:`cluster.KMeans` now supports sparse data when + `solver = "elkan"`. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + :mod:`sklearn.compose` ...................... @@ -140,6 +161,12 @@ Changelog samples in the training set. :pr:`14516` by :user:`Johann Faouzi `. +- |Fix| Fixed a bug in :class:`ensemble.BaggingClassifier`, + :class:`ensemble.BaggingRegressor` and :class:`ensemble.IsolationForest` + where the attribute `estimators_samples_` did not generate the proper indices + used during `fit`. + :pr:`16437` by :user:`Jin-Hwan CHO `. + :mod:`sklearn.feature_extraction` ................................. @@ -210,6 +237,12 @@ Changelog :mod:`sklearn.metrics` ...................... +- |Feature| Added :func:`metrics.mean_absolute_percentage_error` metric and + the associated scorer for regression problems. :issue:`10708` fixed with the + PR :pr:`15007` by :user:`Ashutosh Hathidara `. The scorer and + some practical test cases were taken from PR :pr:`10711` by + :user:`Mohamed Ali Jamaoui `. + - |Fix| Fixed a bug in :func:`metrics.mean_squared_error` to not ignore argument `squared` when argument `multioutput='raw_values'`. :pr:`16323` by :user:`Rushabh Vasani ` @@ -217,12 +250,6 @@ Changelog - |Fix| Fixed a bug in :func:`metrics.mutual_info_score` where negative scores could be returned. :pr:`16362` by `Thomas Fan`_. 
-- |Feature| Added :func:`metrics.mean_absolute_percentage_error` metric and - the associated scorer for regression problems. :issue:`10708` fixed with the - PR :pr:`15007` by :user:`Ashutosh Hathidara `. The scorer and - some practical test cases were taken from PR :pr:`10711` by - :user:`Mohamed Ali Jamaoui `. - :mod:`sklearn.model_selection` .............................. diff --git a/setup.cfg b/setup.cfg index aab681328698d..f086993b26a29 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ addopts = --ignore maint_tools --doctest-modules --disable-pytest-warnings - -rs + -rxXs filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning diff --git a/setup.py b/setup.py index 798c8bfc5d305..90162b65644e5 100755 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ NUMPY_MIN_VERSION = '1.13.3' JOBLIB_MIN_VERSION = '0.11' +THREADPOOLCTL_MIN_VERSION = '2.0.0' # Optional setuptools features # We need to import setuptools early, if we want setuptools features, @@ -257,7 +258,8 @@ def setup_package(): install_requires=[ 'numpy>={}'.format(NUMPY_MIN_VERSION), 'scipy>={}'.format(SCIPY_MIN_VERSION), - 'joblib>={}'.format(JOBLIB_MIN_VERSION) + 'joblib>={}'.format(JOBLIB_MIN_VERSION), + 'threadpoolctl>={}'.format(THREADPOOLCTL_MIN_VERSION) ], package_data={'': ['*.pxd']}, **extra_setuptools_args) diff --git a/sklearn/base.py b/sklearn/base.py index be329c196abb5..e56e13872bffb 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -31,6 +31,7 @@ 'stateless': False, 'multilabel': False, '_skip_test': False, + '_xfail_test': False, 'multioutput_only': False, 'binary_only': False, 'requires_fit': True} diff --git a/sklearn/calibration.py b/sklearn/calibration.py index e90207bb5eca7..ff9c4b3e75c44 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -27,6 +27,7 @@ from .isotonic import IsotonicRegression from .svm import LinearSVC from .model_selection import check_cv +from .utils.validation import _deprecate_positional_args class 
CalibratedClassifierCV(BaseEstimator, ClassifierMixin, @@ -98,7 +99,8 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin, .. [4] Predicting Good Probabilities with Supervised Learning, A. Niculescu-Mizil & R. Caruana, ICML 2005 """ - def __init__(self, base_estimator=None, method='sigmoid', cv=None): + @_deprecate_positional_args + def __init__(self, base_estimator=None, *, method='sigmoid', cv=None): self.base_estimator = base_estimator self.method = method self.cv = cv @@ -275,7 +277,8 @@ class _CalibratedClassifier: .. [4] Predicting Good Probabilities with Supervised Learning, A. Niculescu-Mizil & R. Caruana, ICML 2005 """ - def __init__(self, base_estimator, method='sigmoid', classes=None): + @_deprecate_positional_args + def __init__(self, base_estimator, *, method='sigmoid', classes=None): self.base_estimator = base_estimator self.method = method self.classes = classes diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 8ac6ce3e27eca..7fb11c1033981 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -3,6 +3,7 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod +import warnings import numpy as np @@ -88,7 +89,7 @@ class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta): @abstractmethod def __init__(self, n_clusters=3, svd_method="randomized", n_svd_vecs=None, mini_batch=False, init="k-means++", - n_init=10, n_jobs=None, random_state=None): + n_init=10, n_jobs='deprecated', random_state=None): self.n_clusters = n_clusters self.svd_method = svd_method self.n_svd_vecs = n_svd_vecs @@ -115,6 +116,10 @@ def fit(self, X, y=None): y : Ignored """ + if self.n_jobs != 'deprecated': + warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" + " removed in 0.25.", FutureWarning) + X = check_array(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() self._fit(X) @@ -233,6 +238,10 @@ class SpectralCoclustering(BaseSpectral): ``-1`` means using all 
processors. See :term:`Glossary ` for more details. + .. deprecated:: 0.23 + ``n_jobs`` was deprecated in version 0.23 and will be removed in + 0.25. + random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means initialization. Use an int to make the randomness deterministic. @@ -277,7 +286,7 @@ class SpectralCoclustering(BaseSpectral): """ def __init__(self, n_clusters=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', - n_init=10, n_jobs=None, random_state=None): + n_init=10, n_jobs='deprecated', random_state=None): super().__init__(n_clusters, svd_method, n_svd_vecs, @@ -380,6 +389,10 @@ class SpectralBiclustering(BaseSpectral): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. deprecated:: 0.23 + ``n_jobs`` was deprecated in version 0.23 and will be removed in + 0.25. + random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means initialization. Use an int to make the randomness deterministic. 
@@ -425,7 +438,7 @@ class SpectralBiclustering(BaseSpectral): def __init__(self, n_clusters=3, method='bistochastic', n_components=6, n_best=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', - n_init=10, n_jobs=None, random_state=None): + n_init=10, n_jobs='deprecated', random_state=None): super().__init__(n_clusters, svd_method, n_svd_vecs, diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index e2f357a32aef4..e95c8fe0490a4 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -1,38 +1,42 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False +# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True # # Author: Andreas Mueller # # Licence: BSD 3 clause +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This is fixed in cython > 0.3. 
+ import numpy as np cimport numpy as np cimport cython from cython cimport floating - +from cython.parallel import prange, parallel from libc.math cimport sqrt +from libc.stdlib cimport calloc, free +from libc.string cimport memset, memcpy -from ..metrics import euclidean_distances -from ._k_means_fast import _centers_dense +from ..utils.extmath import row_norms +from ._k_means_fast cimport _relocate_empty_clusters_dense +from ._k_means_fast cimport _relocate_empty_clusters_sparse +from ._k_means_fast cimport _euclidean_dense_dense +from ._k_means_fast cimport _euclidean_sparse_dense +from ._k_means_fast cimport _average_centers +from ._k_means_fast cimport _center_shift -cdef floating euclidean_dist(floating* a, floating* b, int n_features) nogil: - cdef floating result, tmp - result = 0 - cdef int i - for i in range(n_features): - tmp = (a[i] - b[i]) - result += tmp * tmp - return sqrt(result) +np.import_array() -cdef update_labels_distances_inplace( - floating* X, floating* centers, floating[:, :] center_half_distances, - int[:] labels, floating[:, :] lower_bounds, floating[:] upper_bounds, - Py_ssize_t n_samples, int n_features, int n_clusters): - """ - Calculate upper and lower bounds for each sample. +def _init_bounds_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[:, ::1] centers, # IN + floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds): # OUT + """Initialize upper and lower bounds for each sample for dense input data. Given X, centers and the pairwise distances divided by 2.0 between the centers this calculates the upper bounds and lower bounds for each sample. @@ -49,212 +53,586 @@ cdef update_labels_distances_inplace( Parameters ---------- - X : nd-array, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features), dtype=floating The input data. 
- centers : nd-array, shape (n_clusters, n_features) + centers : ndarray of shape (n_clusters, n_features), dtype=floating The cluster centers. - center_half_distances : nd-array, shape (n_clusters, n_clusters) + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating The half of the distance between any 2 clusters centers. - labels : nd-array, shape(n_samples) + labels : ndarray of shape(n_samples), dtype=int The label for each sample. This array is modified in place. - lower_bounds : nd-array, shape(n_samples, n_clusters) - The lower bound on the distance between a sample and each cluster - center. It is modified in place. + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. + + lower_bounds : ndarray, of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. + """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + int n_features = X.shape[1] + + floating min_dist, dist + int best_cluster, i, j + + for i in range(n_samples): + best_cluster = 0 + min_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[0, 0], + n_features, False) + lower_bounds[i, 0] = min_dist + for j in range(1, n_clusters): + if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def _init_bounds_sparse( + X, # IN + floating[:, ::1] centers, # IN + floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds): # OUT + """Initialize upper and lower bounds for each sample for sparse input data. 
+ + Given X, centers and the pairwise distances divided by 2.0 between the + centers this calculates the upper bounds and lower bounds for each sample. + The upper bound for each sample is set to the distance between the sample + and the closest center. + + The lower bound for each sample is a one-dimensional array of n_clusters. + For each sample i assume that the previously assigned cluster is c1 and the + previous closest distance is dist, for a new cluster c2, the + lower_bound[i][c2] is set to distance between the sample and this new + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents + computation of unnecessary distances for each sample to the clusters that + it is unlikely to be assigned to. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The input data. Must be in CSR format. - upper_bounds : nd-array, shape(n_samples,) - The distance of each sample from its closest cluster center. This is - modified in place by the function. + centers : ndarray of shape (n_clusters, n_features), dtype=floating + The cluster centers. - n_samples : Py_ssize_t - The number of samples. + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + The half of the distance between any 2 clusters centers. + + labels : ndarray of shape(n_samples), dtype=int + The label for each sample. This array is modified in place. - n_features : int - The number of features. + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. - n_clusters : int - The number of clusters. + lower_bounds : ndarray of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. 
""" - # assigns closest center to X - # uses triangle inequality - cdef floating* x - cdef floating* c - cdef floating d_c, dist - cdef int c_x, j - cdef Py_ssize_t sample - for sample in range(n_samples): - # assign first cluster center - c_x = 0 - x = X + sample * n_features - d_c = euclidean_dist(x, centers, n_features) - lower_bounds[sample, 0] = d_c + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + int n_features = X.shape[1] + + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating min_dist, dist + int best_cluster, i, j + + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in range(n_samples): + best_cluster = 0 + min_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[0], centers_squared_norms[0], False) + + lower_bounds[i, 0] = min_dist for j in range(1, n_clusters): - if d_c > center_half_distances[c_x, j]: - c = centers + j * n_features - dist = euclidean_dist(x, c, n_features) - lower_bounds[sample, j] = dist - if dist < d_c: - d_c = dist - c_x = j - labels[sample] = c_x - upper_bounds[sample] = d_c - - -def k_means_elkan(np.ndarray[floating, ndim=2, mode='c'] X_, - np.ndarray[floating, ndim=1, mode='c'] sample_weight, - int n_clusters, - np.ndarray[floating, ndim=2, mode='c'] init, - float tol=1e-4, int max_iter=30, verbose=False): - """Run Elkan's k-means. 
+ if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def _elkan_iter_chunked_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + int[::1] labels, # INOUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means Elkan algorithm with dense input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. Parameters ---------- - X_ : nd-array, shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. - sample_weight : nd-array, shape (n_samples,) + sample_weight : ndarray of shape (n_samples,), dtype=floating The weights for each observation in X. - n_clusters : int - Number of clusters to find. + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. 
+ + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + Half pairwise distances between centers. + + distance_next_center : ndarray of shape (n_clusters,), dtype=floating + Distance between each center its closest center. + + upper_bounds : ndarray of shape (n_samples,), dtype=floating + Upper bound for the distance between each sample and its center, + updated inplace. - init : nd-array, shape (n_clusters, n_features) - Initial position of centers. + lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating + Lower bound for the distance between each sample and each center, + updated inplace. - tol : float, default=1e-4 - The relative increment in cluster means before declaring convergence. + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. - max_iter : int, default=30 - Maximum number of iterations of the k-means algorithm. + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. - verbose : bool, default=False - Whether to be verbose. + n_threads : int + The number of threads to be used by openmp. + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. """ - if floating is float: - dtype = np.float32 - else: - dtype = np.float64 - - # initialize - cdef np.ndarray[floating, ndim=2, mode='c'] centers_ = init - cdef floating* centers_p = centers_.data - cdef floating* X_p = X_.data - cdef floating* x_p - cdef Py_ssize_t n_samples = X_.shape[0] - cdef Py_ssize_t n_features = X_.shape[1] - cdef Py_ssize_t point_index - cdef int center_index, label - cdef floating upper_bound, distance - cdef floating[:, :] center_half_distances = euclidean_distances(centers_) / 2. 
- cdef floating[:, :] lower_bounds = np.zeros((n_samples, n_clusters), dtype=dtype) - cdef floating[:] distance_next_center - labels_ = np.empty(n_samples, dtype=np.int32) - cdef int[:] labels = labels_ - upper_bounds_ = np.empty(n_samples, dtype=dtype) - cdef floating[:] upper_bounds = upper_bounds_ - - # Get the initial set of upper bounds and lower bounds for each sample. - update_labels_distances_inplace(X_p, centers_p, center_half_distances, - labels, lower_bounds, upper_bounds, - n_samples, n_features, n_clusters) - cdef np.uint8_t[:] bounds_tight = np.ones(n_samples, dtype=np.uint8) - cdef np.ndarray[floating, ndim=2, mode='c'] new_centers - - if max_iter <= 0: - raise ValueError('Number of iterations should be a positive number' - ', got %d instead' % max_iter) - - for iteration in range(max_iter): - if verbose: - print("start iteration") - - cd = np.asarray(center_half_distances) - distance_next_center = np.partition(cd, kth=1, axis=0)[1] - - if verbose: - print("done sorting") - - for point_index in range(n_samples): - upper_bound = upper_bounds[point_index] - label = labels[point_index] - - # This means that the next likely center is far away from the - # currently assigned center and the sample is unlikely to be - # reassigned. - if distance_next_center[label] >= upper_bound: - continue - x_p = X_p + point_index * n_features - - # TODO: get pointer to lower_bounds[point_index, center_index] - for center_index in range(n_clusters): + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_new.shape[0] + + # hard-coded number of samples per chunk. Splitting in chunks is + # necessary to get parallelism. 
Chunk size chosen to be the same as Lloyd's
+ if update_centers: + with gil: + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + + if update_centers: + _relocate_empty_clusters_dense(X, sample_weight, centers_old, + centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + # update lower and upper bounds + for i in range(n_samples): + upper_bounds[i] += center_shift[labels[i]] + + for j in range(n_clusters): + lower_bounds[i, j] -= center_shift[j] + if lower_bounds[i, j] < 0: + lower_bounds[i, j] = 0 + + +cdef void _update_chunk_dense( + floating *X, # IN + # expecting C alinged 2D array. XXX: Can be + # replaced by const memoryview when cython min + # version is >= 0.3 + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) nogil: + """K-means combined EM step for one dense data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. 
+ if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): # If this holds, then center_index is a good candidate for the # sample to be relabelled, and we need to confirm this by # recomputing the upper and lower bounds. - if (center_index != label - and (upper_bound > lower_bounds[point_index, center_index]) - and (upper_bound > center_half_distances[center_index, label])): - - # Recompute the upper bound by calculating the actual distance - # between the sample and label. - if not bounds_tight[point_index]: - upper_bound = euclidean_dist(x_p, centers_p + label * n_features, n_features) - lower_bounds[point_index, label] = upper_bound - bounds_tight[point_index] = 1 - - # If the condition still holds, then compute the actual distance between - # the sample and center_index. If this is still lesser than the previous - # distance, reassign labels. - if (upper_bound > lower_bounds[point_index, center_index] - or (upper_bound > center_half_distances[label, center_index])): - distance = euclidean_dist(x_p, centers_p + center_index * n_features, n_features) - lower_bounds[point_index, center_index] = distance + if (j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j])): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_dense_dense( + X + i * n_features, ¢ers_old[label, 0], n_features, False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. If this is less + # than the previous distance, reassign label. 
+ if (upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j])): + + distance = _euclidean_dense_dense( + X + i * n_features, ¢ers_old[j, 0], n_features, False) + lower_bounds[i, j] = distance if distance < upper_bound: - label = center_index + label = j upper_bound = distance - labels[point_index] = label - upper_bounds[point_index] = upper_bound - - if verbose: - print("end inner loop") - - # compute new centers - new_centers = _centers_dense(X_, sample_weight, labels_, - n_clusters, upper_bounds_) - bounds_tight[:] = 0 - - # compute distance each center moved - center_shift = np.sqrt(np.sum((centers_ - new_centers) ** 2, axis=1)) - - # update bounds accordingly - lower_bounds = np.maximum(lower_bounds - center_shift, 0) - upper_bounds = upper_bounds + center_shift[labels_] - - # reassign centers - centers_ = new_centers - centers_p = new_centers.data - - # update between-center distances - center_half_distances = euclidean_distances(centers_) / 2. - if verbose: - print('Iteration %i, inertia %s' - % (iteration, np.sum((X_ - centers_[labels]) ** 2 * - sample_weight[:,np.newaxis]))) - center_shift_total = np.sum(center_shift ** 2) - if center_shift_total <= tol: - if verbose: - print("center shift %e within tolerance %e" - % (center_shift_total, tol)) - break - - # We need this to make sure that the labels give the same output as - # predict(X) - if center_shift_total > 0: - update_labels_distances_inplace(X_p, centers_p, center_half_distances, - labels, lower_bounds, upper_bounds, - n_samples, n_features, n_clusters) - return centers_, labels_, iteration + 1 + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i] + + +def _elkan_iter_chunked_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] 
centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + int[::1] labels, # INOUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means Elkan algorithm with sparse input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + Half pairwise distances between centers. + + distance_next_center : ndarray of shape (n_clusters,), dtype=floating + Distance between each center its closest center. + + upper_bounds : ndarray of shape (n_samples,), dtype=floating + Upper bound for the distance between each sample and its center, + updated inplace. + + lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating + Lower bound for the distance between each sample and each center, + updated inplace. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. 
+ + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. + + n_threads : int + The number of threads to be used by openmp. + + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. + """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_new.shape[0] + + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + # hard-coded number of samples per chunk. Splitting in chunks is + # necessary to get parallelism. Chunk size chosed to be same as lloyd's + int n_samples_chunk = 256 if n_samples > 256 else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx, n_samples_chunk_eff + int start, end + + int i, j, k + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_sparse( + X_data[X_indptr[start]: X_indptr[end]], + 
X_indices[X_indptr[start]: X_indptr[end]], + X_indptr[start: end], + sample_weight[start: end], + centers_old, + centers_squared_norms, + center_half_distances, + distance_next_center, + labels[start: end], + upper_bounds[start: end], + lower_bounds[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + update_centers) + + # reduction from local buffers. The gil is necessary for that to avoid + # race conditions. + if update_centers: + with gil: + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + + if update_centers: + _relocate_empty_clusters_sparse( + X_data, X_indices, X_indptr, sample_weight, + centers_old, centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + # update lower and upper bounds + for i in range(n_samples): + upper_bounds[i] += center_shift[labels[i]] + + for j in range(n_clusters): + lower_bounds[i, j] -= center_shift[j] + if lower_bounds[i, j] < 0: + lower_bounds[i, j] = 0 + + +cdef void _update_chunk_sparse( + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + floating[:, ::1] center_half_distances, # IN + floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. 
+ """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + int s = X_indptr[0] + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. + if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): + + # If this holds, then center_index is a good candidate for the + # sample to be relabelled, and we need to confirm this by + # recomputing the upper and lower bounds. + if (j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j])): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[label], centers_squared_norms[label], False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. If this is less + # than the previous distance, reassign label. 
+ if (upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j])): + distance = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[j], centers_squared_norms[j], False) + lower_bounds[i, j] = distance + if distance < upper_bound: + label = j + upper_bound = distance + + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/sklearn/cluster/_k_means_fast.pxd b/sklearn/cluster/_k_means_fast.pxd new file mode 100644 index 0000000000000..b8dcd947f92c6 --- /dev/null +++ b/sklearn/cluster/_k_means_fast.pxd @@ -0,0 +1,23 @@ +# cython: language_level=3 + + +from cython cimport floating +cimport numpy as np + + +cdef floating _euclidean_dense_dense(floating*, floating*, int, bint) nogil + +cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1], + floating, bint) nogil + +cpdef void _relocate_empty_clusters_dense( + np.ndarray[floating, ndim=2, mode='c'], floating[::1], floating[:, ::1], + floating[:, ::1], floating[::1], int[::1]) + +cpdef void _relocate_empty_clusters_sparse( + floating[::1], int[::1], int[::1], floating[::1], floating[:, ::1], + floating[:, ::1], floating[::1], int[::1]) + +cdef void _average_centers(floating[:, ::1], floating[::1]) + +cdef void _center_shift(floating[:, ::1], floating[:, ::1], floating[::1]) diff --git a/sklearn/cluster/_k_means_fast.pyx b/sklearn/cluster/_k_means_fast.pyx index 8a66f25065126..8221b2b15e356 100644 --- a/sklearn/cluster/_k_means_fast.pyx +++ b/sklearn/cluster/_k_means_fast.pyx @@ -1,4 +1,4 @@ -# cython: profile=True +# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True # Profiling is enabled by default as the overhead does not seem to be # measurable on 
this specific use case. @@ -7,155 +7,286 @@ # Lars Buitinck # # License: BSD 3 clause -# -# cython: boundscheck=False, wraparound=False, cdivision=True -from libc.math cimport sqrt +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This is fixed in cython > 0.3. + import numpy as np -import scipy.sparse as sp cimport numpy as np cimport cython from cython cimport floating +from libc.math cimport sqrt + +from ..utils.extmath import row_norms + + +np.import_array() -from ..utils.sparsefuncs_fast import assign_rows_csr -from ..utils._cython_blas cimport _dot ctypedef np.float64_t DOUBLE ctypedef np.int32_t INT -np.import_array() +cdef floating _euclidean_dense_dense( + floating* a, # IN + floating* b, # IN + int n_features, + bint squared) nogil: + """Euclidean distance between a dense and b dense""" + cdef: + int i + int n = n_features // 4 + int rem = n_features % 4 + floating result = 0 + + # We manually unroll the loop for better cache optimization. 
+ for i in range(n): + result += ((a[0] - b[0]) * (a[0] - b[0]) + +(a[1] - b[1]) * (a[1] - b[1]) + +(a[2] - b[2]) * (a[2] - b[2]) + +(a[3] - b[3]) * (a[3] - b[3])) + a += 4; b += 4 + + for i in range(rem): + result += (a[i] - b[i]) * (a[i] - b[i]) + + return result if squared else sqrt(result) + + +def _euclidean_dense_dense_wrapper(floating[::1] a, floating[::1] b, + bint squared): + """Wrapper of _euclidean_dense_dense for testing purpose""" + return _euclidean_dense_dense(&a[0], &b[0], a.shape[0], squared) + + +cdef floating _euclidean_sparse_dense( + floating[::1] a_data, # IN + int[::1] a_indices, # IN + floating[::1] b, # IN + floating b_squared_norm, + bint squared) nogil: + """Euclidean distance between a sparse and b dense""" + cdef: + int nnz = a_indices.shape[0] + int i + floating tmp, bi + floating result = 0.0 + + for i in range(nnz): + bi = b[a_indices[i]] + tmp = a_data[i] - bi + result += tmp * tmp - bi * bi + + result += b_squared_norm + + if result < 0: result = 0.0 + + return result if squared else sqrt(result) -cpdef DOUBLE _assign_labels_array(np.ndarray[floating, ndim=2] X, - np.ndarray[floating, ndim=1] sample_weight, - np.ndarray[floating, ndim=1] x_squared_norms, - np.ndarray[floating, ndim=2] centers, - np.ndarray[INT, ndim=1] labels, - np.ndarray[floating, ndim=1] distances): - """Compute label assignment and inertia for a dense array +def _euclidean_sparse_dense_wrapper( + floating[::1] a_data, + int[::1] a_indices, + floating[::1] b, + floating b_squared_norm, + bint squared): + """Wrapper of _euclidean_sparse_dense for testing purpose""" + return _euclidean_sparse_dense( + a_data, a_indices, b, b_squared_norm, squared) - Return the inertia (sum of squared distances to the centers). 
+ +cpdef floating _inertia_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers, # IN + int[::1] labels): # IN + """Compute inertia for dense input data + + Sum of squared distance between each sample and its assigned center. """ cdef: - unsigned int n_clusters = centers.shape[0] - unsigned int n_features = centers.shape[1] - unsigned int n_samples = X.shape[0] - unsigned int x_stride - unsigned int center_stride - unsigned int sample_idx, center_idx, feature_idx - unsigned int store_distances = 0 - unsigned int k - np.ndarray[floating, ndim=1] center_squared_norms - # the following variables are always double cause make them floating - # does not save any memory, but makes the code much bigger - DOUBLE inertia = 0.0 - DOUBLE min_dist - DOUBLE dist - - if floating is float: - center_squared_norms = np.zeros(n_clusters, dtype=np.float32) - x_stride = X.strides[1] / sizeof(float) - center_stride = centers.strides[1] / sizeof(float) - else: - center_squared_norms = np.zeros(n_clusters, dtype=np.float64) - x_stride = X.strides[1] / sizeof(DOUBLE) - center_stride = centers.strides[1] / sizeof(DOUBLE) - - if n_samples == distances.shape[0]: - store_distances = 1 + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j - for center_idx in range(n_clusters): - center_squared_norms[center_idx] = _dot( - n_features, ¢ers[center_idx, 0], center_stride, - ¢ers[center_idx, 0], center_stride) - - for sample_idx in range(n_samples): - min_dist = -1 - for center_idx in range(n_clusters): - dist = 0.0 - # hardcoded: minimize euclidean distance to cluster center: - # ||a - b||^2 = ||a||^2 + ||b||^2 -2 - dist += _dot(n_features, &X[sample_idx, 0], x_stride, - ¢ers[center_idx, 0], center_stride) - dist *= -2 - dist += center_squared_norms[center_idx] - dist += x_squared_norms[sample_idx] - dist *= sample_weight[sample_idx] - if min_dist == -1 or dist < min_dist: - min_dist = dist - labels[sample_idx] = 
center_idx - - if store_distances: - distances[sample_idx] = min_dist - inertia += min_dist + floating sq_dist = 0.0 + floating inertia = 0.0 + + for i in range(n_samples): + j = labels[i] + sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, True) + inertia += sq_dist * sample_weight[i] return inertia -cpdef DOUBLE _assign_labels_csr(X, np.ndarray[floating, ndim=1] sample_weight, - np.ndarray[DOUBLE, ndim=1] x_squared_norms, - np.ndarray[floating, ndim=2] centers, - np.ndarray[INT, ndim=1] labels, - np.ndarray[floating, ndim=1] distances): - """Compute label assignment and inertia for a CSR input +cpdef floating _inertia_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers, # IN + int[::1] labels): # IN + """Compute inertia for sparse input data - Return the inertia (sum of squared distances to the centers). + Sum of squared distance between each sample and its assigned center. """ cdef: - np.ndarray[floating, ndim=1] X_data = X.data - np.ndarray[INT, ndim=1] X_indices = X.indices - np.ndarray[INT, ndim=1] X_indptr = X.indptr - unsigned int n_clusters = centers.shape[0] - unsigned int n_features = centers.shape[1] - unsigned int n_samples = X.shape[0] - unsigned int store_distances = 0 - unsigned int sample_idx, center_idx, feature_idx - unsigned int k - np.ndarray[floating, ndim=1] center_squared_norms - # the following variables are always double cause make them floating - # does not save any memory, but makes the code much bigger - DOUBLE inertia = 0.0 - DOUBLE min_dist - DOUBLE dist + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr - if floating is float: - center_squared_norms = np.zeros(n_clusters, dtype=np.float32) - else: - center_squared_norms = np.zeros(n_clusters, dtype=np.float64) + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j - if n_samples == distances.shape[0]: - store_distances = 1 + floating sq_dist = 0.0 + floating inertia = 0.0 - for 
center_idx in range(n_clusters): - center_squared_norms[center_idx] = _dot( - n_features, ¢ers[center_idx, 0], 1, - ¢ers[center_idx, 0], 1) - - for sample_idx in range(n_samples): - min_dist = -1 - for center_idx in range(n_clusters): - dist = 0.0 - # hardcoded: minimize euclidean distance to cluster center: - # ||a - b||^2 = ||a||^2 + ||b||^2 -2 - for k in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): - dist += centers[center_idx, X_indices[k]] * X_data[k] - dist *= -2 - dist += center_squared_norms[center_idx] - dist += x_squared_norms[sample_idx] - dist *= sample_weight[sample_idx] - if min_dist == -1 or dist < min_dist: - min_dist = dist - labels[sample_idx] = center_idx - if store_distances: - distances[sample_idx] = dist - inertia += min_dist + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in range(n_samples): + j = labels[i] + sq_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], True) + inertia += sq_dist * sample_weight[i] return inertia +cpdef void _relocate_empty_clusters_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # INOUT + floating[::1] weight_in_clusters, # INOUT + int[::1] labels): # IN + """Relocate centers which have no sample assigned to them.""" + cdef: + int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32) + int n_empty = empty_clusters.shape[0] + + if n_empty == 0: + return + + cdef: + int n_features = X.shape[1] + + floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1) + int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32) + + int new_cluster_id, old_cluster_id, far_idx, idx, k + floating weight + + for idx in range(n_empty): + + new_cluster_id = empty_clusters[idx] 
+ + far_idx = far_from_centers[idx] + weight = sample_weight[far_idx] + + old_cluster_id = labels[far_idx] + + for k in range(n_features): + centers_new[old_cluster_id, k] -= X[far_idx, k] * weight + centers_new[new_cluster_id, k] = X[far_idx, k] * weight + + weight_in_clusters[new_cluster_id] = weight + weight_in_clusters[old_cluster_id] -= weight + + +cpdef void _relocate_empty_clusters_sparse( + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # INOUT + floating[::1] weight_in_clusters, # INOUT + int[::1] labels): # IN + """Relocate centers which have no sample assigned to them.""" + cdef: + int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32) + int n_empty = empty_clusters.shape[0] + + if n_empty == 0: + return + + cdef: + int n_samples = X_indptr.shape[0] - 1 + int n_features = centers_old.shape[1] + floating x + int i, j, k + + floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype) + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + for i in range(n_samples): + j = labels[i] + distances[i] = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers_old[j], centers_squared_norms[j], True) + + cdef: + int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32) + + int new_cluster_id, old_cluster_id, far_idx, idx + floating weight + + for idx in range(n_empty): + + new_cluster_id = empty_clusters[idx] + + far_idx = far_from_centers[idx] + weight = sample_weight[far_idx] + + old_cluster_id = labels[far_idx] + + for k in range(X_indptr[far_idx], X_indptr[far_idx + 1]): + centers_new[old_cluster_id, X_indices[k]] -= X_data[k] * weight + centers_new[new_cluster_id, X_indices[k]] = X_data[k] * weight + + weight_in_clusters[new_cluster_id] = weight + 
weight_in_clusters[old_cluster_id] -= weight + + +cdef void _average_centers( + floating[:, ::1] centers, # INOUT + floating[::1] weight_in_clusters): # IN + """Average new centers wrt weights.""" + cdef: + int n_clusters = centers.shape[0] + int n_features = centers.shape[1] + int j, k + floating alpha + + for j in range(n_clusters): + if weight_in_clusters[j] > 0: + alpha = 1.0 / weight_in_clusters[j] + for k in range(n_features): + centers[j, k] *= alpha + + +cdef void _center_shift( + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # IN + floating[::1] center_shift): # OUT + """Compute shift between old and new centers.""" + cdef: + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + int j + + for j in range(n_clusters): + center_shift[j] = _euclidean_dense_dense( + ¢ers_new[j, 0], ¢ers_old[j, 0], n_features, False) + + def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight, - np.ndarray[DOUBLE, ndim=1] x_squared_norms, + np.ndarray[floating, ndim=1] x_squared_norms, np.ndarray[floating, ndim=2] centers, np.ndarray[floating, ndim=1] weight_sums, np.ndarray[INT, ndim=1] nearest_center, @@ -253,143 +384,3 @@ def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight, - centers[center_idx, feature_idx]) ** 2 return squared_diff - - -def _centers_dense(np.ndarray[floating, ndim=2] X, - np.ndarray[floating, ndim=1] sample_weight, - np.ndarray[INT, ndim=1] labels, int n_clusters, - np.ndarray[floating, ndim=1] distances): - """M step of the K-means EM algorithm - - Computation of cluster centers / means. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - - sample_weight : array-like, shape (n_samples,) - The weights for each observation in X. - - labels : array of integers, shape (n_samples) - Current label assignment - - n_clusters : int - Number of desired clusters - - distances : array-like, shape (n_samples) - Distance to closest cluster for each sample. 
- - Returns - ------- - centers : array, shape (n_clusters, n_features) - The resulting centers - """ - ## TODO: add support for CSR input - cdef int n_samples, n_features - n_samples = X.shape[0] - n_features = X.shape[1] - cdef int i, j, c - cdef np.ndarray[floating, ndim=2] centers - cdef np.ndarray[floating, ndim=1] weight_in_cluster - - dtype = np.float32 if floating is float else np.float64 - centers = np.zeros((n_clusters, n_features), dtype=dtype) - weight_in_cluster = np.zeros((n_clusters,), dtype=dtype) - - for i in range(n_samples): - c = labels[i] - weight_in_cluster[c] += sample_weight[i] - empty_clusters = np.where(weight_in_cluster == 0)[0] - # maybe also relocate small clusters? - - if len(empty_clusters): - # find points to reassign empty clusters to - far_from_centers = distances.argsort()[::-1] - - for i, cluster_id in enumerate(empty_clusters): - # XXX two relocated clusters could be close to each other - far_index = far_from_centers[i] - new_center = X[far_index] * sample_weight[far_index] - centers[cluster_id] = new_center - weight_in_cluster[cluster_id] = sample_weight[far_index] - - for i in range(n_samples): - for j in range(n_features): - centers[labels[i], j] += X[i, j] * sample_weight[i] - - centers /= weight_in_cluster[:, np.newaxis] - - return centers - - -def _centers_sparse(X, np.ndarray[floating, ndim=1] sample_weight, - np.ndarray[INT, ndim=1] labels, n_clusters, - np.ndarray[floating, ndim=1] distances): - """M step of the K-means EM algorithm - - Computation of cluster centers / means. - - Parameters - ---------- - X : scipy.sparse.csr_matrix, shape (n_samples, n_features) - - sample_weight : array-like, shape (n_samples,) - The weights for each observation in X. - - labels : array of integers, shape (n_samples) - Current label assignment - - n_clusters : int - Number of desired clusters - - distances : array-like, shape (n_samples) - Distance to closest cluster for each sample. 
- - Returns - ------- - centers : array, shape (n_clusters, n_features) - The resulting centers - """ - cdef int n_samples, n_features - n_samples = X.shape[0] - n_features = X.shape[1] - cdef int curr_label - - cdef np.ndarray[floating, ndim=1] data = X.data - cdef np.ndarray[int, ndim=1] indices = X.indices - cdef np.ndarray[int, ndim=1] indptr = X.indptr - - cdef np.ndarray[floating, ndim=2, mode="c"] centers - cdef np.ndarray[np.npy_intp, ndim=1] far_from_centers - cdef np.ndarray[floating, ndim=1] weight_in_cluster - dtype = np.float32 if floating is float else np.float64 - centers = np.zeros((n_clusters, n_features), dtype=dtype) - weight_in_cluster = np.zeros((n_clusters,), dtype=dtype) - for i in range(n_samples): - c = labels[i] - weight_in_cluster[c] += sample_weight[i] - cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] empty_clusters = \ - np.where(weight_in_cluster == 0)[0] - cdef int n_empty_clusters = empty_clusters.shape[0] - - # maybe also relocate small clusters? - - if n_empty_clusters > 0: - # find points to reassign empty clusters to - far_from_centers = distances.argsort()[::-1][:n_empty_clusters] - - # XXX two relocated clusters could be close to each other - assign_rows_csr(X, far_from_centers, empty_clusters, centers) - - for i in range(n_empty_clusters): - weight_in_cluster[empty_clusters[i]] = 1 - - for i in range(labels.shape[0]): - curr_label = labels[i] - for ind in range(indptr[i], indptr[i + 1]): - j = indices[ind] - centers[curr_label, j] += data[ind] * sample_weight[i] - - centers /= weight_in_cluster[:, np.newaxis] - - return centers diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx new file mode 100644 index 0000000000000..93e2c6f0b9c89 --- /dev/null +++ b/sklearn/cluster/_k_means_lloyd.pyx @@ -0,0 +1,407 @@ +# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True +# +# Licence: BSD 3 clause + +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# 
fused types and when the array may be read-only (for instance when it's +# provided by the user). This is fixed in cython > 0.3. + +import numpy as np +cimport numpy as np +from cython cimport floating +from cython.parallel import prange, parallel +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport memset, memcpy +from libc.float cimport DBL_MAX, FLT_MAX + +from ..utils.extmath import row_norms +from ..utils._cython_blas cimport _gemm +from ..utils._cython_blas cimport RowMajor, Trans, NoTrans +from ._k_means_fast cimport _relocate_empty_clusters_dense +from ._k_means_fast cimport _relocate_empty_clusters_sparse +from ._k_means_fast cimport _average_centers, _center_shift + + +np.import_array() + + +def _lloyd_iter_chunked_dense( + np.ndarray[floating, ndim=2, mode='c'] X, # IN + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + int[::1] labels, # OUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means lloyd algorithm with dense input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + x_squared_norms : ndarray of shape (n_samples,), dtype=floating + Squared L2 norm of X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. 
+
+    centers_squared_norms : ndarray of shape (n_clusters,), dtype=floating
+        Squared L2 norm of the centers.
+
+    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
+        Placeholder for the sums of the weights of every observation assigned
+        to each center.
+
+    labels : ndarray of shape (n_samples,), dtype=int
+        labels assignment.
+
+    center_shift : ndarray of shape (n_clusters,), dtype=floating
+        Distance between old and new centers.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+
+    update_centers : bool
+        - If True, the labels and the new centers will be computed, i.e. runs
+        the E-step and the M-step of the algorithm.
+        - If False, only the labels will be computed, i.e. runs the E-step of
+        the algorithm. This is useful especially when calling predict on a
+        fitted model.
+    """
+    cdef:
+        int n_samples = X.shape[0]
+        int n_features = X.shape[1]
+        int n_clusters = centers_new.shape[0]
+
+        # hard-coded number of samples per chunk. Appeared to be close to
+        # optimal in all situations.
+ int n_samples_chunk = 256 if n_samples > 256 else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx, n_samples_chunk_eff + int start, end + + int j, k + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + floating *pairwise_distances_chunk + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + pairwise_distances_chunk = malloc(n_samples_chunk * n_clusters * sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_dense( + &X[start, 0], + sample_weight[start: end], + x_squared_norms[start: end], + centers_old, + centers_squared_norms, + labels[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + pairwise_distances_chunk, + update_centers) + + # reduction from local buffers. The gil is necessary for that to avoid + # race conditions. 
+ if update_centers: + with gil: + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + free(pairwise_distances_chunk) + + if update_centers: + _relocate_empty_clusters_dense(X, sample_weight, centers_old, + centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + +cdef void _update_chunk_dense( + floating *X, # IN + # expecting C alinged 2D array. XXX: Can be + # replaced by const memoryview when cython min + # version is >= 0.3 + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + floating *pairwise_distances, # OUT + bint update_centers) nogil: + """K-means combined EM step for one dense data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + + # Instead of computing the full pairwise squared distances matrix, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store + # the - 2 X.C^T + ||C||² term since the argmin for a given sample only + # depends on the centers. 
+ # pairwise_distances = ||C||² + for i in range(n_samples): + for j in range(n_clusters): + pairwise_distances[i * n_clusters + j] = centers_squared_norms[j] + + # pairwise_distances += -2 * X.dot(C.T) + _gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features, + -2.0, X, n_features, ¢ers_old[0, 0], n_features, + 1.0, pairwise_distances, n_clusters) + + for i in range(n_samples): + min_sq_dist = pairwise_distances[i * n_clusters] + label = 0 + for j in range(1, n_clusters): + sq_dist = pairwise_distances[i * n_clusters + j] + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i] + + +def _lloyd_iter_chunked_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + int[::1] labels, # OUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means lloyd algorithm with sparse input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + x_squared_norms : ndarray of shape (n_samples,), dtype=floating + Squared L2 norm of X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. 
+
+    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers after previous iteration, placeholder for the new centers
+        computed during this iteration.
+
+    centers_squared_norms : ndarray of shape (n_clusters,), dtype=floating
+        Squared L2 norm of the centers.
+
+    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
+        Placeholder for the sums of the weights of every observation assigned
+        to each center.
+
+    labels : ndarray of shape (n_samples,), dtype=int
+        labels assignment.
+
+    center_shift : ndarray of shape (n_clusters,), dtype=floating
+        Distance between old and new centers.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+
+    update_centers : bool
+        - If True, the labels and the new centers will be computed, i.e. runs
+        the E-step and the M-step of the algorithm.
+        - If False, only the labels will be computed, i.e. runs the E-step of
+        the algorithm. This is useful especially when calling predict on a
+        fitted model.
+    """
+    # print(X.indices.dtype)
+    cdef:
+        int n_samples = X.shape[0]
+        int n_features = X.shape[1]
+        int n_clusters = centers_new.shape[0]
+
+        # Chosen same as for dense. Does not have the same impact since with
+        # sparse data the pairwise distances matrix is not precomputed.
+        # However, splitting in chunks is necessary to get parallelism.
+ int n_samples_chunk = 256 if n_samples > 256 else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx, n_samples_chunk_eff = 0 + int start = 0, end = 0 + + int j, k + + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_sparse( + X_data[X_indptr[start]: X_indptr[end]], + X_indices[X_indptr[start]: X_indptr[end]], + X_indptr[start: end], + sample_weight[start: end], + x_squared_norms[start: end], + centers_old, + centers_squared_norms, + labels[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + update_centers) + + # reduction from local buffers. The gil is necessary for that to avoid + # race conditions. 
+ if update_centers: + with gil: + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + + if update_centers: + _relocate_empty_clusters_sparse( + X_data, X_indices, X_indptr, sample_weight, + centers_old, centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + +cdef void _update_chunk_sparse( + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[::1] x_squared_norms, # IN + floating[:, ::1] centers_old, # IN + floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + floating max_floating = FLT_MAX if floating is float else DBL_MAX + int s = X_indptr[0] + + # XXX Precompute the pairwise distances matrix is not worth for sparse + # currently. Should be tested when BLAS (sparse x dense) matrix + # multiplication is available. 
+ for i in range(n_samples): + min_sq_dist = max_floating + label = 0 + + for j in range(n_clusters): + sq_dist = 0.0 + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + sq_dist += centers_old[j, X_indices[k]] * X_data[k] + + # Instead of computing the full squared distance with each cluster, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to compute + # the - 2 X.C^T + ||C||² term since the argmin for a given sample + # only depends on the centers C. + sq_dist = centers_squared_norms[j] -2 * sq_dist + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7ca1db87e0035..7e4df5908137b 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -15,12 +15,11 @@ import numpy as np import scipy.sparse as sp -from joblib import Parallel, delayed, effective_n_jobs +from threadpoolctl import threadpool_limits from ..base import BaseEstimator, ClusterMixin, TransformerMixin from ..metrics.pairwise import euclidean_distances -from ..metrics.pairwise import pairwise_distances_argmin_min -from ..utils.extmath import row_norms, squared_norm, stable_cumsum +from ..utils.extmath import row_norms, stable_cumsum from ..utils.sparsefuncs_fast import assign_rows_csr from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import _num_samples @@ -28,10 +27,17 @@ from ..utils import gen_batches from ..utils import check_random_state from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import FLOAT_DTYPES +from ..utils._openmp_helpers import _openmp_effective_n_threads from ..exceptions import ConvergenceWarning -from . 
import _k_means_fast as _k_means -from ._k_means_elkan import k_means_elkan +from ._k_means_fast import _inertia_dense +from ._k_means_fast import _inertia_sparse +from ._k_means_fast import _mini_batch_update_csr +from ._k_means_lloyd import _lloyd_iter_chunked_dense +from ._k_means_lloyd import _lloyd_iter_chunked_sparse +from ._k_means_elkan import _init_bounds_dense +from ._k_means_elkan import _init_bounds_sparse +from ._k_means_elkan import _elkan_iter_chunked_dense +from ._k_means_elkan import _elkan_iter_chunked_sparse ############################################################################### @@ -43,21 +49,21 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): Parameters ---------- - X : array or sparse matrix, shape (n_samples, n_features) + X : {ndarray, sparse matrix} of shape (n_samples, n_features) The data to pick seeds for. To avoid memory copy, the input data should be double precision (dtype=np.float64). - n_clusters : integer + n_clusters : int The number of seeds to choose - x_squared_norms : array, shape (n_samples,) + x_squared_norms : ndarray of shape (n_samples,) Squared Euclidean norm of each data point. random_state : RandomState instance The generator used to initialize the centers. See :term:`Glossary `. - n_local_trials : integer, optional + n_local_trials : int, default=None The number of seeding trials for each center (except the first), of which the one reducing inertia the most is greedily chosen. 
Set to None to make the number of trials depend logarithmically
@@ -152,6 +158,8 @@ def _validate_center_shape(X, n_centers, centers):
 
 def _tolerance(X, tol):
     """Return a tolerance which is independent of the dataset"""
+    if tol == 0:
+        return 0
     if sp.issparse(X):
         variances = mean_variance_axis(X, axis=0)[1]
     else:
@@ -175,16 +183,16 @@ def _check_normalize_sample_weight(sample_weight, X):
 
 
 def k_means(X, n_clusters, sample_weight=None, init='k-means++',
-            precompute_distances='auto', n_init=10, max_iter=300,
+            precompute_distances='deprecated', n_init=10, max_iter=300,
             verbose=False, tol=1e-4, random_state=None, copy_x=True,
-            n_jobs=None, algorithm="auto", return_n_iter=False):
+            n_jobs='deprecated', algorithm="auto", return_n_iter=False):
     """K-means clustering algorithm.
 
     Read more in the :ref:`User Guide `.
 
     Parameters
     ----------
-    X : array-like or sparse matrix, shape (n_samples, n_features)
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
         The observations to cluster. It must be noted that the data
         will be converted to C ordering, which will cause a memory copy
         if the given data is not C-contiguous.
@@ -193,25 +201,25 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
         The number of clusters to form as well as the number of centroids to
         generate.
 
-    sample_weight : array-like, shape (n_samples,), optional
+    sample_weight : array-like of shape (n_samples,), default=None
         The weights for each observation in X. If None, all observations
-        are assigned equal weight (default: None)
+        are assigned equal weight
 
-    init : {'k-means++', 'random', or ndarray, or a callable}, optional
-        Method for initialization, default to 'k-means++':
+    init : {'k-means++', 'random', ndarray, callable}, default='k-means++'
+        Method for initialization:
 
         'k-means++' : selects initial cluster centers for k-mean
         clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
- 'random': choose k observations (rows) at random from data for - the initial centroids. + 'random': choose `n_clusters` observations (rows) at random from data + for the initial centroids. If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. - If a callable is passed, it should take arguments X, k and - and a random state and return an initialization. + If a callable is passed, it should take arguments X, n_clusters and a + random state and return an initialization. precompute_distances : {'auto', True, False} Precompute distances (faster but takes more memory). @@ -224,59 +232,73 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++', False : never precompute distances - n_init : int, optional, default: 10 + .. deprecated:: 0.23 + 'precompute_distances' was deprecated in version 0.23 and will be + removed in 0.25. It has no effect. + + n_init : int, default=10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. - max_iter : int, optional, default 300 + max_iter : int, default=300 Maximum number of iterations of the k-means algorithm to run. - verbose : boolean, optional + verbose : bool, default=False Verbosity mode. - tol : float, optional + tol : float, default=1e-4 Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive iterations to declare convergence. + It's not advised to set `tol=0` since convergence might never be + declared due to rounding errors. Use a very small number instead. random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. - copy_x : bool, optional + copy_x : bool, default=True When pre-computing distances it is more numerically accurate to center - the data first. 
If copy_x is True (default), then the original data is - not modified, ensuring X is C-contiguous. If False, the original data - is modified, and put back before the function returns, but small - numerical differences may be introduced by subtracting and then adding - the data mean, in this case it will also not ensure that data is - C-contiguous which may cause a significant slowdown. - - n_jobs : int or None, optional (default=None) - The number of jobs to use for the computation. This works by computing - each of the n_init runs in parallel. - - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - algorithm : "auto", "full" or "elkan", default="auto" + the data first. If copy_x is True (default), then the original data is + not modified. If False, the original data is modified, and put back + before the function returns, but small numerical differences may be + introduced by subtracting and then adding the data mean. Note that if + the original data is not C-contiguous, a copy will be made even if + copy_x is False. If the original data is sparse, but not in CSR format, + a copy will be made even if copy_x is False. + + n_jobs : int, default=None + The number of OpenMP threads to use for the computation. Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. + + ``None`` or ``-1`` means using all processors. + + .. deprecated:: 0.23 + ``n_jobs`` was deprecated in version 0.23 and will be removed in + 0.25. + + algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". - The "elkan" variation is more efficient by using the triangle - inequality, but currently doesn't support sparse data. "auto" chooses - "elkan" for dense data and "full" for sparse data. 
+        The "elkan" variation is more efficient on data with well-defined
+        clusters, by using the triangle inequality. However it's more memory
+        intensive due to the allocation of an extra array of shape
+        (n_samples, n_clusters).
 
-    return_n_iter : bool, optional
+        For now "auto" (kept for backward compatibility) chooses "elkan" but it
+        might change in the future for a better heuristic.
+
+    return_n_iter : bool, default=False
         Whether or not to return the number of iterations.
 
     Returns
     -------
-    centroid : float ndarray with shape (k, n_features)
+    centroid : ndarray of shape (n_clusters, n_features)
         Centroids found at the last iteration of k-means.
 
-    label : integer ndarray with shape (n_samples,)
+    label : ndarray of shape (n_samples,)
         label[i] is the code or index of the centroid the
         i'th observation is closest to.
 
@@ -288,7 +310,6 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
         Number of iterations corresponding to the best results.
         Returned only if `return_n_iter` is set to True.
""" - est = KMeans( n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, verbose=verbose, precompute_distances=precompute_distances, tol=tol, @@ -303,93 +324,69 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++', def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, init='k-means++', verbose=False, x_squared_norms=None, - random_state=None, tol=1e-4, - precompute_distances=True): - if sp.issparse(X): - raise TypeError("algorithm='elkan' not supported for sparse input X") - random_state = check_random_state(random_state) - if x_squared_norms is None: - x_squared_norms = row_norms(X, squared=True) - # init - centers = _init_centroids(X, n_clusters, init, random_state=random_state, - x_squared_norms=x_squared_norms) - centers = np.ascontiguousarray(centers) - if verbose: - print('Initialization complete') - - checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) - centers, labels, n_iter = k_means_elkan(X, checked_sample_weight, - n_clusters, centers, tol=tol, - max_iter=max_iter, verbose=verbose) - if sample_weight is None: - inertia = np.sum((X - centers[labels]) ** 2, dtype=np.float64) - else: - sq_distances = np.sum((X - centers[labels]) ** 2, axis=1, - dtype=np.float64) * checked_sample_weight - inertia = np.sum(sq_distances, dtype=np.float64) - return labels, inertia, centers, n_iter - - -def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, - init='k-means++', verbose=False, x_squared_norms=None, - random_state=None, tol=1e-4, - precompute_distances=True): - """A single run of k-means, assumes preparation completed prior. + random_state=None, tol=1e-4, n_threads=1): + """A single run of k-means lloyd, assumes preparation completed prior. Parameters ---------- - X : array-like of floats, shape (n_samples, n_features) - The observations to cluster. + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The observations to cluster. If sparse matrix, must be in CSR format. 
+ + sample_weight : array-like of shape (n_samples,) + The weights for each observation in X. n_clusters : int The number of clusters to form as well as the number of centroids to generate. - sample_weight : array-like, shape (n_samples,) - The weights for each observation in X. - - max_iter : int, optional, default 300 + max_iter : int, default=300 Maximum number of iterations of the k-means algorithm to run. - init : {'k-means++', 'random', or ndarray, or a callable}, optional - Method for initialization, default to 'k-means++': + init : {'k-means++', 'random', ndarray, callable}, default='k-means++' + Method for initialization: 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section Notes in k_init for more details. - 'random': choose k observations (rows) at random from data for - the initial centroids. - - If an ndarray is passed, it should be of shape (k, p) and gives - the initial centers. + 'random': choose `n_clusters` observations (rows) at random from data + for the initial centroids. - If a callable is passed, it should take arguments X, k and - and a random state and return an initialization. + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. - tol : float, optional - The relative increment in the results before declaring convergence. + If a callable is passed, it should take arguments X, n_clusters and a + random state and return an initialization. - verbose : boolean, optional + verbose : bool, default=False Verbosity mode - x_squared_norms : array + x_squared_norms : array-like, default=None Precomputed x_squared_norms. - precompute_distances : boolean, default: True - Precompute distances (faster but takes more memory). - random_state : int, RandomState instance, default=None Determines random number generation for centroid initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. 
+ tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + It's not advised to set `tol=0` since convergence might never be + declared due to rounding errors. Use a very small number instead. + + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. + Returns ------- - centroid : float ndarray with shape (k, n_features) + centroid : ndarray of shape (n_clusters, n_features) Centroids found at the last iteration of k-means. - label : integer ndarray with shape (n_samples,) + label : ndarray of shape (n_samples,) label[i] is the code or index of the centroid the i'th observation is closest to. @@ -401,197 +398,286 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, Number of iterations run. """ random_state = check_random_state(random_state) - sample_weight = _check_normalize_sample_weight(sample_weight, X) - best_labels, best_inertia, best_centers = None, None, None # init centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) + if verbose: - print("Initialization complete") + print('Initialization complete') - # Allocate memory to store the distances for each sample to its - # closer center for reallocation in case of ties - distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) + n_samples = X.shape[0] + + centers_new = np.zeros_like(centers) + weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) + labels = np.full(n_samples, -1, dtype=np.int32) + center_half_distances = euclidean_distances(centers) / 2 + distance_next_center = np.partition(np.asarray(center_half_distances), + kth=1, axis=0)[1] + upper_bounds = np.zeros(n_samples, dtype=X.dtype) + lower_bounds = np.zeros((n_samples, n_clusters), dtype=X.dtype) + center_shift = 
np.zeros(n_clusters, dtype=X.dtype) + + if sp.issparse(X): + init_bounds = _init_bounds_sparse + elkan_iter = _elkan_iter_chunked_sparse + _inertia = _inertia_sparse + else: + init_bounds = _init_bounds_dense + elkan_iter = _elkan_iter_chunked_dense + _inertia = _inertia_dense + + init_bounds(X, centers, center_half_distances, + labels, upper_bounds, lower_bounds) - # iterations for i in range(max_iter): - centers_old = centers.copy() - # labels assignment is also called the E-step of EM - labels, inertia = \ - _labels_inertia(X, sample_weight, x_squared_norms, centers, - precompute_distances=precompute_distances, - distances=distances) - - # computation of the means is also called the M-step of EM - if sp.issparse(X): - centers = _k_means._centers_sparse(X, sample_weight, labels, - n_clusters, distances) - else: - centers = _k_means._centers_dense(X, sample_weight, labels, - n_clusters, distances) + elkan_iter(X, sample_weight, centers, centers_new, weight_in_clusters, + center_half_distances, distance_next_center, upper_bounds, + lower_bounds, labels, center_shift, n_threads) - if verbose: - print("Iteration %2d, inertia %.3f" % (i, inertia)) + # compute new pairwise distances between centers and closest other + # center of each center for next iterations + center_half_distances = euclidean_distances(centers_new) / 2 + distance_next_center = np.partition(np.asarray(center_half_distances), + kth=1, axis=0)[1] - if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() - best_inertia = inertia + if verbose: + inertia = _inertia(X, sample_weight, centers, labels) + print("Iteration {0}, inertia {1}" .format(i, inertia)) - center_shift_total = squared_norm(centers_old - centers) - if center_shift_total <= tol: + center_shift_tot = (center_shift**2).sum() + if center_shift_tot <= tol: if verbose: - print("Converged at iteration %d: " - "center shift %e within tolerance %e" - % (i, center_shift_total, tol)) + 
print("Converged at iteration {0}: " + "center shift {1} within tolerance {2}" + .format(i, center_shift_tot, tol)) break - if center_shift_total > 0: - # rerun E-step in case of non-convergence so that predicted labels - # match cluster centers - best_labels, best_inertia = \ - _labels_inertia(X, sample_weight, x_squared_norms, best_centers, - precompute_distances=precompute_distances, - distances=distances) + centers, centers_new = centers_new, centers - return best_labels, best_inertia, best_centers, i + 1 + if center_shift_tot > 0: + # rerun E-step so that predicted labels match cluster centers + elkan_iter(X, sample_weight, centers, centers, weight_in_clusters, + center_half_distances, distance_next_center, upper_bounds, + lower_bounds, labels, center_shift, n_threads, + update_centers=False) + inertia = _inertia(X, sample_weight, centers, labels) -def _labels_inertia_precompute_dense(X, sample_weight, x_squared_norms, - centers, distances): - """Compute labels and inertia using a full distance matrix. + return labels, inertia, centers, i + 1 - This will overwrite the 'distances' array in-place. + +def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, + init='k-means++', verbose=False, x_squared_norms=None, + random_state=None, tol=1e-4, n_threads=1): + """A single run of k-means lloyd, assumes preparation completed prior. Parameters ---------- - X : numpy array, shape (n_sample, n_features) - Input data. + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The observations to cluster. If sparse matrix, must be in CSR format. - sample_weight : array-like, shape (n_samples,) + sample_weight : ndarray of shape (n_samples,) The weights for each observation in X. - x_squared_norms : numpy array, shape (n_samples,) - Precomputed squared norms of X. + n_clusters : int + The number of clusters to form as well as the number of + centroids to generate. 
+ + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm to run. + + init : {'k-means++', 'random', ndarray, callable}, default='k-means++' + Method for initialization: + + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + + 'random': choose `n_clusters` observations (rows) at random from data + for the initial centroids. + + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + If a callable is passed, it should take arguments X, n_clusters and a + random state and return an initialization. + + verbose : bool, default=False + Verbosity mode + + x_squared_norms : ndarray of shape(n_samples,), default=None + Precomputed x_squared_norms. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. - centers : numpy array, shape (n_clusters, n_features) - Cluster centers which data is assigned to. + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + It's not advised to set `tol=0` since convergence might never be + declared due to rounding errors. Use a very small number instead. - distances : numpy array, shape (n_samples,) - Pre-allocated array in which distances are stored. + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. Returns ------- - labels : numpy array, dtype=np.int, shape (n_samples,) - Indices of clusters that samples are assigned to. + centroid : ndarray of shape (n_clusters, n_features) + Centroids found at the last iteration of k-means. 
+ + label : ndarray of shape (n_samples,) + label[i] is the code or index of the centroid the + i'th observation is closest to. inertia : float - Sum of squared distances of samples to their closest cluster center. + The final value of the inertia criterion (sum of squared distances to + the closest centroid for all observations in the training set). + n_iter : int + Number of iterations run. """ - n_samples = X.shape[0] + random_state = check_random_state(random_state) + sample_weight = _check_normalize_sample_weight(sample_weight, X) - # Breakup nearest neighbor distance computation into batches to prevent - # memory blowup in the case of a large number of samples and clusters. - # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs. - labels, mindist = pairwise_distances_argmin_min( - X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True}) - # cython k-means code assumes int32 inputs - labels = labels.astype(np.int32, copy=False) - if n_samples == distances.shape[0]: - # distances will be changed in-place - distances[:] = mindist - inertia = (mindist * sample_weight).sum() - return labels, inertia + # init + centers = _init_centroids(X, n_clusters, init, random_state=random_state, + x_squared_norms=x_squared_norms) + + if verbose: + print("Initialization complete") + + centers_new = np.zeros_like(centers) + labels = np.full(X.shape[0], -1, dtype=np.int32) + weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) + center_shift = np.zeros(n_clusters, dtype=X.dtype) + + if sp.issparse(X): + lloyd_iter = _lloyd_iter_chunked_sparse + _inertia = _inertia_sparse + else: + lloyd_iter = _lloyd_iter_chunked_dense + _inertia = _inertia_dense + + for i in range(max_iter): + lloyd_iter(X, sample_weight, x_squared_norms, centers, centers_new, + weight_in_clusters, labels, center_shift, n_threads) + + if verbose: + inertia = _inertia(X, sample_weight, centers, labels) + print("Iteration {0}, inertia {1}" .format(i, inertia)) + + 
center_shift_tot = (center_shift**2).sum() + if center_shift_tot <= tol: + if verbose: + print("Converged at iteration {0}: " + "center shift {1} within tolerance {2}" + .format(i, center_shift_tot, tol)) + break + centers, centers_new = centers_new, centers -def _labels_inertia(X, sample_weight, x_squared_norms, centers, - precompute_distances=True, distances=None): + if center_shift_tot > 0: + # rerun E-step so that predicted labels match cluster centers + lloyd_iter(X, sample_weight, x_squared_norms, centers, centers, + weight_in_clusters, labels, center_shift, n_threads, + update_centers=False) + + inertia = _inertia(X, sample_weight, centers, labels) + + return labels, inertia, centers, i + 1 + + +def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): """E step of the K-means EM algorithm. Compute the labels and the inertia of the given samples and centers. - This will compute the distances in-place. Parameters ---------- - X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features) - The input samples to assign to the labels. + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples to assign to the labels. If sparse matrix, must be in + CSR format. - sample_weight : array-like, shape (n_samples,) + sample_weight : array-like of shape (n_samples,) The weights for each observation in X. - x_squared_norms : array, shape (n_samples,) + x_squared_norms : ndarray of shape (n_samples,) Precomputed squared euclidean norm of each data point, to speed up computations. - centers : float array, shape (k, n_features) + centers : ndarray, shape (n_clusters, n_features) The cluster centers. - precompute_distances : boolean, default: True - Precompute distances (faster but takes more memory). - - distances : float array, shape (n_samples,) - Pre-allocated array to be filled in with each sample's distance - to the closest center. 
+ n_threads : int, default=1 + The number of OpenMP threads to use for the computation. Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. Returns ------- - labels : int array of shape(n) + labels : ndarray of shape (n_samples,) The resulting assignment inertia : float Sum of squared distances of samples to their closest cluster center. """ n_samples = X.shape[0] + n_clusters = centers.shape[0] + sample_weight = _check_normalize_sample_weight(sample_weight, X) - # set the default value of centers to -1 to be able to detect any anomaly - # easily - labels = np.full(n_samples, -1, np.int32) - if distances is None: - distances = np.zeros(shape=(0,), dtype=X.dtype) - # distances will be changed in-place + labels = np.full(n_samples, -1, dtype=np.int32) + weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) + center_shift = np.zeros_like(weight_in_clusters) + if sp.issparse(X): - inertia = _k_means._assign_labels_csr( - X, sample_weight, x_squared_norms, centers, labels, - distances=distances) + _labels = _lloyd_iter_chunked_sparse + _inertia = _inertia_sparse else: - if precompute_distances: - return _labels_inertia_precompute_dense(X, sample_weight, - x_squared_norms, centers, - distances) - inertia = _k_means._assign_labels_array( - X, sample_weight, x_squared_norms, centers, labels, - distances=distances) + _labels = _lloyd_iter_chunked_dense + _inertia = _inertia_dense + + _labels(X, sample_weight, x_squared_norms, centers, centers, + weight_in_clusters, labels, center_shift, n_threads, + update_centers=False) + + inertia = _inertia(X, sample_weight, centers, labels) + return labels, inertia -def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, - init_size=None): +def _init_centroids(X, n_clusters=8, init="k-means++", random_state=None, + x_squared_norms=None, init_size=None): """Compute the initial centroids Parameters ---------- - X : array, shape (n_samples, n_features) + X : 
{ndarray, sparse matrix} of shape (n_samples, n_features)
+        The input samples.
 
-    k : int
-        number of centroids
+    n_clusters : int, default=8
+        number of centroids.
 
-    init : {'k-means++', 'random' or ndarray or callable} optional
-        Method for initialization
+    init : {'k-means++', 'random', ndarray, callable}, default="k-means++"
+        Method for initialization.
 
     random_state : int, RandomState instance, default=None
         Determines random number generation for centroid initialization. Use
         an int to make the randomness deterministic.
         See :term:`Glossary `.
 
-    x_squared_norms : array, shape (n_samples,), optional
+    x_squared_norms : ndarray of shape (n_samples,), default=None
         Squared euclidean norm of each data point. Pass it if you have it at
         hands already to avoid it being recomputed here. Default: None
 
-    init_size : int, optional
+    init_size : int, default=None
         Number of samples to randomly sample for speeding up the
         initialization (sometimes at the expense of accuracy): the only
         algorithm is initialized by running a batch KMeans on a
@@ -599,7 +685,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
 
     Returns
     -------
-    centers : array, shape(k, n_features)
+    centers : array of shape(k, n_features)
     """
     random_state = check_random_state(random_state)
     n_samples = X.shape[0]
@@ -608,32 +694,33 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
         x_squared_norms = row_norms(X, squared=True)
 
     if init_size is not None and init_size < n_samples:
-        if init_size < k:
+        if init_size < n_clusters:
             warnings.warn(
                 "init_size=%d should be larger than k=%d. 
" - "Setting it to 3*k" % (init_size, k), + "Setting it to 3*k" % (init_size, n_clusters), RuntimeWarning, stacklevel=2) - init_size = 3 * k + init_size = 3 * n_clusters init_indices = random_state.randint(0, n_samples, init_size) X = X[init_indices] x_squared_norms = x_squared_norms[init_indices] n_samples = X.shape[0] - elif n_samples < k: + elif n_samples < n_clusters: raise ValueError( - "n_samples=%d should be larger than k=%d" % (n_samples, k)) + "n_samples={} should be larger than n_clusters={}" + .format(n_samples, n_clusters)) if isinstance(init, str) and init == 'k-means++': - centers = _k_init(X, k, random_state=random_state, + centers = _k_init(X, n_clusters, random_state=random_state, x_squared_norms=x_squared_norms) elif isinstance(init, str) and init == 'random': - seeds = random_state.permutation(n_samples)[:k] + seeds = random_state.permutation(n_samples)[:n_clusters] centers = X[seeds] elif hasattr(init, '__array__'): # ensure that the centers have the same dtype as X # this is a requirement of fused types of cython centers = np.array(init, dtype=X.dtype) elif callable(init): - centers = init(X, k, random_state=random_state) + centers = init(X, n_clusters, random_state=random_state) centers = np.asarray(centers, dtype=X.dtype) else: raise ValueError("the init parameter for the k-means should " @@ -643,7 +730,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, if sp.issparse(centers): centers = centers.toarray() - _validate_center_shape(X, k, centers) + _validate_center_shape(X, n_clusters, centers) return centers @@ -659,20 +746,22 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): The number of clusters to form as well as the number of centroids to generate. 
- init : {'k-means++', 'random'} or ndarray of shape \ - (n_clusters, n_features), default='k-means++' - Method for initialization, defaults to 'k-means++': + init : {'k-means++', 'random', ndarray, callable}, default='k-means++' + Method for initialization: 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section Notes in k_init for more details. - 'random': choose k observations (rows) at random from data for - the initial centroids. + 'random': choose `n_clusters` observations (rows) at random from data + for the initial centroids. If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. + If a callable is passed, it should take arguments X, n_clusters and a + random state and return an initialization. + n_init : int, default=10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of @@ -686,8 +775,10 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive iterations to declare convergence. + It's not advised to set `tol=0` since convergence might never be + declared due to rounding errors. Use a very small number instead. - precompute_distances : 'auto' or bool, default='auto' + precompute_distances : {'auto', True, False}, default='auto' Precompute distances (faster but takes more memory). 'auto' : do not precompute distances if n_samples * n_clusters > 12 @@ -698,6 +789,10 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): False : never precompute distances. + .. deprecated:: 0.23 + 'precompute_distances' was deprecated in version 0.22 and will be + removed in 0.25. It has no effect. + verbose : int, default=0 Verbosity mode. 
@@ -708,26 +803,34 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): copy_x : bool, default=True When pre-computing distances it is more numerically accurate to center - the data first. If copy_x is True (default), then the original data is - not modified, ensuring X is C-contiguous. If False, the original data - is modified, and put back before the function returns, but small - numerical differences may be introduced by subtracting and then adding - the data mean, in this case it will also not ensure that data is - C-contiguous which may cause a significant slowdown. + the data first. If copy_x is True (default), then the original data is + not modified. If False, the original data is modified, and put back + before the function returns, but small numerical differences may be + introduced by subtracting and then adding the data mean. Note that if + the original data is not C-contiguous, a copy will be made even if + copy_x is False. If the original data is sparse, but not in CSR format, + a copy will be made even if copy_x is False. n_jobs : int, default=None - The number of jobs to use for the computation. This works by computing - each of the n_init runs in parallel. + The number of OpenMP threads to use for the computation. Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. + ``None`` or ``-1`` means using all processors. + + .. deprecated:: 0.23 + ``n_jobs`` was deprecated in version 0.23 and will be removed in + 0.25. algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". - The "elkan" variation is more efficient by using the triangle - inequality, but currently doesn't support sparse data. "auto" chooses - "elkan" for dense data and "full" for sparse data. 
+ The "elkan" variation is more efficient on data with well-defined + clusters, by using the triangle inequality. However it's more memory + intensive due to the allocation of an extra array of shape + (n_samples, n_clusters). + + For now "auto" (kept for backward compatibiliy) chooses "elkan" but it + might change in the future for a better heuristic. Attributes ---------- @@ -745,7 +848,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): n_iter_ : int Number of iterations run. - See Also + See also -------- MiniBatchKMeans @@ -794,9 +897,9 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): """ def __init__(self, n_clusters=8, init='k-means++', n_init=10, - max_iter=300, tol=1e-4, precompute_distances='auto', + max_iter=300, tol=1e-4, precompute_distances='deprecated', verbose=0, random_state=None, copy_x=True, - n_jobs=None, algorithm='auto'): + n_jobs='deprecated', algorithm='auto'): self.n_clusters = n_clusters self.init = init @@ -811,7 +914,8 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10, self.algorithm = algorithm def _check_test_data(self, X): - X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES) + X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], + order='C', accept_large_sparse=False) n_samples, n_features = X.shape expected_n_features = self.cluster_centers_.shape[1] if not n_features == expected_n_features: @@ -826,17 +930,19 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. y : Ignored Not used, present here for API consistency by convention. 
- sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. Returns ------- @@ -845,6 +951,19 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) + if self.precompute_distances != 'deprecated': + warnings.warn("'precompute_distances' was deprecated in version " + "0.23 and will be removed in 0.25. It has no " + "effect", FutureWarning) + + if self.n_jobs != 'deprecated': + warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" + " removed in 0.25.", FutureWarning) + self._n_threads = self.n_jobs + else: + self._n_threads = None + self._n_threads = _openmp_effective_n_threads(self._n_threads) + n_init = self.n_init if n_init <= 0: raise ValueError("Invalid number of initializations." @@ -856,10 +975,8 @@ def fit(self, X, y=None, sample_weight=None): ' got %d instead' % self.max_iter ) - # avoid forcing order when copy_x=False - order = "C" if self.copy_x else None X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], - order=order, copy=self.copy_x) + order='C', copy=self.copy_x, accept_large_sparse=False) # verify that the number of samples given is larger than k if _num_samples(X) < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( @@ -867,28 +984,10 @@ def fit(self, X, y=None, sample_weight=None): tol = _tolerance(X, self.tol) - # If the distances are precomputed every job will create a matrix of - # shape (n_clusters, n_samples). To stop KMeans from eating up memory - # we only activate this if the created matrix is guaranteed to be - # under 100MB. 12 million entries consume a little under 100MB if they - # are of type double. 
- precompute_distances = self.precompute_distances - if precompute_distances == 'auto': - n_samples = X.shape[0] - precompute_distances = (self.n_clusters * n_samples) < 12e6 - elif isinstance(precompute_distances, bool): - pass - else: - raise ValueError( - "precompute_distances should be 'auto' or True/False" - ", but a value of %r was passed" % - precompute_distances - ) - # Validate init array init = self.init if hasattr(init, '__array__'): - init = check_array(init, dtype=X.dtype.type, copy=True) + init = check_array(init, dtype=X.dtype.type, copy=True, order='C') _validate_center_shape(X, self.n_clusters, init) if n_init != 1: @@ -911,59 +1010,43 @@ def fit(self, X, y=None, sample_weight=None): x_squared_norms = row_norms(X, squared=True) best_labels, best_inertia, best_centers = None, None, None + algorithm = self.algorithm - if self.n_clusters == 1: - # elkan doesn't make sense for a single cluster, full will produce - # the right result. + if algorithm == "elkan" and self.n_clusters == 1: + warnings.warn("algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'full' instead.", RuntimeWarning) algorithm = "full" + if algorithm == "auto": - algorithm = "full" if sp.issparse(X) else 'elkan' + algorithm = "full" if self.n_clusters == 1 else "elkan" + if algorithm == "full": kmeans_single = _kmeans_single_lloyd elif algorithm == "elkan": kmeans_single = _kmeans_single_elkan else: raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got" - " %s" % str(algorithm)) + " {}".format(str(algorithm))) + # seeds for the initializations of the kmeans runs. seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) - if effective_n_jobs(self.n_jobs) == 1: - # For a single thread, less memory is needed if we just store one - # set of the best results (as opposed to one set per run per - # thread). + + # limit number of threads in second level of nested parallelism + # (i.e. BLAS) to avoid oversubsciption. 
+ with threadpool_limits(limits=1, user_api="blas"): for seed in seeds: # run a k-means once labels, inertia, centers, n_iter_ = kmeans_single( - X, sample_weight, self.n_clusters, - max_iter=self.max_iter, init=init, verbose=self.verbose, - precompute_distances=precompute_distances, tol=tol, - x_squared_norms=x_squared_norms, random_state=seed) + X, sample_weight, self.n_clusters, max_iter=self.max_iter, + init=init, verbose=self.verbose, tol=tol, + x_squared_norms=x_squared_norms, random_state=seed, + n_threads=self._n_threads) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia best_n_iter = n_iter_ - else: - # parallelisation of k-means runs - results = Parallel(n_jobs=self.n_jobs, verbose=0)( - delayed(kmeans_single)( - X, sample_weight, self.n_clusters, - max_iter=self.max_iter, init=init, - verbose=self.verbose, tol=tol, - precompute_distances=precompute_distances, - x_squared_norms=x_squared_norms, - # Change seed to ensure variety - random_state=seed - ) - for seed in seeds) - # Get results with the lowest inertia - labels, inertia, centers, n_iters = zip(*results) - best = np.argmin(inertia) - best_labels = labels[best] - best_inertia = inertia[best] - best_centers = centers[best] - best_n_iter = n_iters[best] if not sp.issparse(X): if not self.copy_x: @@ -976,8 +1059,7 @@ def fit(self, X, y=None, sample_weight=None): "Number of distinct clusters ({}) found smaller than " "n_clusters ({}). Possibly due to duplicate points " "in X.".format(distinct_clusters, self.n_clusters), - ConvergenceWarning, stacklevel=2 - ) + ConvergenceWarning, stacklevel=2) self.cluster_centers_ = best_centers self.labels_ = best_labels @@ -999,13 +1081,13 @@ def fit_predict(self, X, y=None, sample_weight=None): y : Ignored Not used, present here for API consistency by convention. 
- sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. Returns ------- - labels : array, shape [n_samples,] + labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. """ return self.fit(X, sample_weight=sample_weight).labels_ @@ -1023,13 +1105,13 @@ def fit_transform(self, X, y=None, sample_weight=None): y : Ignored Not used, present here for API consistency by convention. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. Returns ------- - X_new : array, shape [n_samples, k] + X_new : array of shape (n_samples, n_clusters) X transformed in the new space. """ # Currently, this just skips a copy of the data if it is not in @@ -1052,7 +1134,7 @@ def transform(self, X): Returns ------- - X_new : array, shape [n_samples, k] + X_new : ndarray of shape (n_samples, n_clusters) X transformed in the new space. """ check_is_fitted(self) @@ -1076,21 +1158,22 @@ def predict(self, X, sample_weight=None): X : {array-like, sparse matrix} of shape (n_samples, n_features) New data to predict. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. Returns ------- - labels : array, shape [n_samples,] + labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. 
""" check_is_fitted(self) X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) + return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_)[0] + self.cluster_centers_, self._n_threads)[0] def score(self, X, y=None, sample_weight=None): """Opposite of the value of X on the K-means objective. @@ -1103,9 +1186,9 @@ def score(self, X, y=None, sample_weight=None): y : Ignored Not used, present here for API consistency by convention. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. Returns ------- @@ -1116,6 +1199,7 @@ def score(self, X, y=None, sample_weight=None): X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) + return -_labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[1] @@ -1188,8 +1272,7 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, """ # Perform label assignment to nearest centers nearest_center, inertia = _labels_inertia(X, sample_weight, - x_squared_norms, centers, - distances=distances) + x_squared_norms, centers) if random_reassign and reassignment_ratio > 0: random_state = check_random_state(random_state) @@ -1224,7 +1307,7 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, # implementation for the sparse CSR representation completely written in # cython if sp.issparse(X): - return inertia, _k_means._mini_batch_update_csr( + return inertia, _mini_batch_update_csr( X, sample_weight, x_squared_norms, centers, weight_sums, nearest_center, old_center_buffer, compute_squared_diff) @@ -1728,6 +1811,13 @@ def partial_fit(self, X, y=None, sample_weight=None): 10 * (1 + self.counts_.min())) == 0 distances = np.zeros(X.shape[0], dtype=X.dtype) + # Raise error if partial_fit called on data with 
different number + # of features. + if X.shape[1] != self.cluster_centers_.shape[1]: + raise ValueError( + "Number of features %d does not match previous " + "data %d." % (X.shape[1], self.cluster_centers_.shape[1])) + _mini_batch_step(X, sample_weight, x_squared_norms, self.cluster_centers_, self.counts_, np.zeros(0, dtype=X.dtype), 0, diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py index a0ee8e62853c1..48ed25c5c0eaf 100644 --- a/sklearn/cluster/setup.py +++ b/sklearn/cluster/setup.py @@ -13,6 +13,7 @@ def configuration(parent_package='', top_path=None): libraries.append('m') config = Configuration('cluster', parent_package, top_path) + config.add_extension('_dbscan_inner', sources=['_dbscan_inner.pyx'], include_dirs=[numpy.get_include()], @@ -24,14 +25,19 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) - config.add_extension('_k_means_elkan', - sources=['_k_means_elkan.pyx'], + config.add_extension('_k_means_fast', + sources=['_k_means_fast.pyx'], include_dirs=[numpy.get_include()], libraries=libraries) - config.add_extension('_k_means_fast', - sources=['_k_means_fast.pyx'], - include_dirs=numpy.get_include(), + config.add_extension('_k_means_lloyd', + sources=['_k_means_lloyd.pyx'], + include_dirs=[numpy.get_include()], + libraries=libraries) + + config.add_extension('_k_means_elkan', + sources=['_k_means_elkan.pyx'], + include_dirs=[numpy.get_include()], libraries=libraries) config.add_subpackage('tests') diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 7dd54416f0b04..a31e61dd2423d 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -67,8 +67,7 @@ def test_spectral_coclustering(): 'n_svd_vecs': [None, 20], 'mini_batch': [False, True], 'init': ['k-means++'], - 'n_init': [10], - 'n_jobs': [1]} + 'n_init': [10]} random_state = 0 S, rows, cols = make_biclusters((30, 30), 3, noise=0.5, 
random_state=random_state) @@ -253,3 +252,16 @@ def test_wrong_shape(): data = np.arange(27).reshape((3, 3, 3)) with pytest.raises(ValueError): model.fit(data) + + +@pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering]) +@pytest.mark.parametrize("n_jobs", [None, 1]) +def test_n_jobs_deprecated(klass, n_jobs): + # FIXME: remove in 0.25 + depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " + "in 0.25.") + S, _, _ = make_biclusters((30, 30), 3, noise=0.5, random_state=0) + est = klass(random_state=0, n_jobs=n_jobs) + + with pytest.warns(FutureWarning, match=depr_msg): + est.fit(S) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 16e7c73e01423..2bcbc3faa517f 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -3,6 +3,7 @@ import numpy as np from scipy import sparse as sp +from threadpoolctl import threadpool_limits import pytest @@ -12,18 +13,24 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import if_safe_multiprocessing_with_blas from sklearn.utils._testing import assert_raise_message from sklearn.utils.validation import _num_samples from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning from sklearn.utils.extmath import row_norms +from sklearn.metrics import pairwise_distances_argmin from sklearn.metrics.cluster import v_measure_score from sklearn.cluster import KMeans, k_means from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _labels_inertia from sklearn.cluster._kmeans import _mini_batch_step +from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense +from sklearn.cluster._k_means_fast import _relocate_empty_clusters_sparse +from sklearn.cluster._k_means_fast import _euclidean_dense_dense_wrapper +from 
sklearn.cluster._k_means_fast import _euclidean_sparse_dense_wrapper +from sklearn.cluster._k_means_fast import _inertia_dense +from sklearn.cluster._k_means_fast import _inertia_sparse from sklearn.datasets import make_blobs from io import StringIO from sklearn.metrics.cluster import homogeneity_score @@ -42,10 +49,8 @@ X_csr = sp.csr_matrix(X) -@pytest.mark.parametrize("representation, algo", - [('dense', 'full'), - ('dense', 'elkan'), - ('sparse', 'full')]) +@pytest.mark.parametrize("representation", ["dense", "sparse"]) +@pytest.mark.parametrize("algo", ["full", "elkan"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_kmeans_results(representation, algo, dtype): # cheks that kmeans works as intended @@ -68,8 +73,70 @@ def test_kmeans_results(representation, algo, dtype): assert kmeans.n_iter_ == expected_n_iter +@pytest.mark.parametrize("array_constr", + [np.array, sp.csr_matrix], + ids=['dense', 'sparse']) +@pytest.mark.parametrize("algo", ['full', 'elkan']) +def test_relocated_clusters(array_constr, algo): + # check that empty clusters are relocated as expected + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.25 + expected_centers = [[0.25, 0], [0.75, 1]] + expected_n_iter = 3 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_almost_equal(kmeans.inertia_, expected_inertia) + assert_array_almost_equal(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.parametrize("representation", ["dense", "sparse"]) +def test_relocate_empty_clusters(representation): + # test for the _relocate_empty_clusters_(dense/sparse) helpers + + # Synthetic dataset with 3 obvious clusters of different sizes + X = 
np.array( + [-10., -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) + if representation == "sparse": + X = sp.csr_matrix(X) + sample_weight = np.full(shape=10, fill_value=1.) + + # centers all initialized to the first point of X + centers_old = np.array([-10., -10, -10]).reshape(-1, 1) + + # With this initialization, all points will be assigned to the first center + # At this point a center in centers_new is the weighted sum of the points + # it contains if it's not empty, otherwise it is the same as before. + centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) + weight_in_clusters = np.array([10., 0, 0]) + labels = np.zeros(10, dtype=np.int32) + + if representation == "dense": + _relocate_empty_clusters_dense(X, sample_weight, centers_old, + centers_new, weight_in_clusters, labels) + else: + _relocate_empty_clusters_sparse(X.data, X.indices, X.indptr, + sample_weight, centers_old, + centers_new, weight_in_clusters, + labels) + + # The relocation scheme will take the 2 points farthest from the center and + # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The + # first center will be updated to contain the other 8 points. 
+ assert_array_equal(weight_in_clusters, [8, 1, 1]) + assert_allclose(centers_new, [[-36], [10], [9.5]]) + + @pytest.mark.parametrize('distribution', ['normal', 'blobs']) -@pytest.mark.parametrize('tol', [0, 1e-2, 1e-4, 1e-8]) +@pytest.mark.parametrize('tol', [1e-2, 1e-4, 1e-8]) def test_elkan_results(distribution, tol): # check that results are identical between lloyd and elkan algorithms rnd = np.random.RandomState(0) @@ -85,16 +152,47 @@ def test_elkan_results(distribution, tol): km_full.fit(X) km_elkan.fit(X) - assert_array_almost_equal(km_elkan.cluster_centers_, - km_full.cluster_centers_) + assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) assert_array_equal(km_elkan.labels_, km_full.labels_) - # The number of iterations and inertia should be close but not - # necessarily exactly the same because of rounding errors. - assert km_elkan.n_iter_ == pytest.approx(km_full.n_iter_, rel=0.01) + assert km_elkan.n_iter_ == km_full.n_iter_ assert km_elkan.inertia_ == pytest.approx(km_full.inertia_, rel=1e-6) +@pytest.mark.parametrize('algorithm', ['full', 'elkan']) +def test_kmeans_convergence(algorithm): + # Check that KMeans stops when convergence is reached when tol=0. 
(#16075) + rnd = np.random.RandomState(0) + X = rnd.normal(size=(5000, 10)) + + km = KMeans(algorithm=algorithm, n_clusters=5, random_state=0, n_init=1, + tol=0, max_iter=300).fit(X) + + assert km.n_iter_ < 300 + + +@pytest.mark.parametrize('distribution', ['normal', 'blobs']) +def test_elkan_results_sparse(distribution): + # check that results are identical between lloyd and elkan algorithms + # with sparse input + rnd = np.random.RandomState(0) + if distribution == 'normal': + X = sp.random(100, 100, density=0.1, format='csr', random_state=rnd) + X.data = rnd.randn(len(X.data)) + else: + X, _ = make_blobs(n_samples=100, n_features=100, random_state=rnd) + X = sp.csr_matrix(X) + + km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1) + km_elkan = KMeans(algorithm='elkan', n_clusters=5, + random_state=0, n_init=1) + + km_full.fit(X) + km_elkan.fit(X) + assert_allclose(km_elkan.cluster_centers_, km_full.cluster_centers_) + assert_allclose(km_elkan.labels_, km_full.labels_) + + def test_labels_assignment_and_inertia(): # pure numpy implementation as easily auditable reference gold # implementation @@ -237,33 +335,6 @@ def test_k_means_new_centers(): np.testing.assert_array_equal(this_labels, labels) -@if_safe_multiprocessing_with_blas -def test_k_means_plus_plus_init_2_jobs(): - km = KMeans(init="k-means++", n_clusters=n_clusters, n_jobs=2, - random_state=42).fit(X) - _check_fitted_model(km) - - -def test_k_means_precompute_distances_flag(): - # check that a warning is raised if the precompute_distances flag is not - # supported - km = KMeans(precompute_distances="wrong") - with pytest.raises(ValueError): - km.fit(X) - - -def test_k_means_plus_plus_init_not_precomputed(): - km = KMeans(init="k-means++", n_clusters=n_clusters, random_state=42, - precompute_distances=False).fit(X) - _check_fitted_model(km) - - -def test_k_means_random_init_not_precomputed(): - km = KMeans(init="random", n_clusters=n_clusters, random_state=42, - 
precompute_distances=False).fit(X) - _check_fitted_model(km) - - @pytest.mark.parametrize('data', [X, X_csr], ids=['dense', 'sparse']) @pytest.mark.parametrize('init', ['random', 'k-means++', centers.copy()]) def test_k_means_init(data, init): @@ -320,8 +391,7 @@ def test_k_means_fortran_aligned_data(): X = np.asfortranarray([[0, 0], [0, 1], [0, 1]]) centers = np.array([[0, 0], [0, 1]]) labels = np.array([0, 1, 1]) - km = KMeans(n_init=1, init=centers, precompute_distances=False, - random_state=42, n_clusters=2) + km = KMeans(n_init=1, init=centers, random_state=42, n_clusters=2) km.fit(X) assert_array_almost_equal(km.cluster_centers_, centers) assert_array_equal(km.labels_, labels) @@ -349,20 +419,24 @@ def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol): pytest.xfail( "Known failures on MacOS, See " "https://github.com/scikit-learn/scikit-learn/issues/12644") - if not (algo == 'elkan' and constructor is sp.csr_matrix): - rng = np.random.RandomState(seed) - X = make_blobs(n_samples=1000, n_features=10, centers=10, - random_state=rng)[0].astype(dtype, copy=False) - X = constructor(X) + rng = np.random.RandomState(seed) + + X = make_blobs(n_samples=1000, n_features=10, centers=10, + random_state=rng)[0].astype(dtype, copy=False) + X = constructor(X) - kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed, - tol=tol, max_iter=max_iter, n_jobs=1) + kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed, + tol=tol, max_iter=max_iter) - labels_1 = kmeans.fit(X).predict(X) - labels_2 = kmeans.fit_predict(X) + labels_1 = kmeans.fit(X).predict(X) + labels_2 = kmeans.fit_predict(X) - assert_array_equal(labels_1, labels_2) + # Due to randomness in the order in which chunks of data are processed when + # using more than one thread, the absolute values of the labels can be + # different between the 2 strategies but they should correspond to the same + # clustering. 
+ assert v_measure_score(labels_1, labels_2) == 1 def test_mb_kmeans_verbose(): @@ -673,7 +747,7 @@ def test_fit_transform(): @pytest.mark.parametrize('algo', ['full', 'elkan']) def test_predict_equal_labels(algo): - km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1, + km = KMeans(random_state=13, n_init=1, max_iter=1, algorithm=algo) km.fit(X) assert_array_equal(km.predict(X), km.labels_) @@ -733,11 +807,6 @@ def test_k_means_function(): with pytest.raises(ValueError): k_means(X, n_clusters=X.shape[0] + 1, sample_weight=None) - # kmeans for algorithm='elkan' raises TypeError on sparse matrix - assert_raise_message(TypeError, "algorithm='elkan' not supported for " - "sparse input X", k_means, X=X_csr, n_clusters=2, - sample_weight=None, algorithm="elkan") - def test_x_squared_norms_init_centroids(): # Test that x_squared_norms can be None in _init_centroids @@ -780,8 +849,7 @@ def test_float_precision(Estimator, is_sparse): X_new[dtype] = estimator.transform(X_test) centers[dtype] = estimator.cluster_centers_ # ensure the extracted row is a 2d array - assert (estimator.predict(X_test[:1]) == - estimator.labels_[0]) + assert estimator.predict(X_test[:1]) == estimator.labels_[0] if hasattr(estimator, 'partial_fit'): estimator.partial_fit(X_test[0:3]) # dtype of cluster centers has to stay the same after @@ -966,11 +1034,136 @@ def test_minibatch_kmeans_partial_fit_int_data(): assert km.cluster_centers_.dtype.kind == "f" -def test_result_of_kmeans_equal_in_diff_n_jobs(): - # PR 9288 +def test_result_of_kmeans_equal_in_diff_n_threads(): + # Check that KMeans gives the same results in parallel mode than in + # sequential mode. 
rnd = np.random.RandomState(0) X = rnd.normal(size=(50, 10)) - result_1 = KMeans(n_clusters=3, random_state=0, n_jobs=1).fit(X).labels_ - result_2 = KMeans(n_clusters=3, random_state=0, n_jobs=2).fit(X).labels_ + with threadpool_limits(limits=1, user_api="openmp"): + result_1 = KMeans( + n_clusters=3, random_state=0).fit(X).labels_ + with threadpool_limits(limits=2, user_api="openmp"): + result_2 = KMeans( + n_clusters=3, random_state=0).fit(X).labels_ assert_array_equal(result_1, result_2) + + +@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) +def test_precompute_distance_deprecated(precompute_distances): + # FIXME: remove in 0.25 + depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " + "will be removed in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, + precompute_distances=precompute_distances) + + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) + + +@pytest.mark.parametrize("n_jobs", [None, 1]) +def test_n_jobs_deprecated(n_jobs): + # FIXME: remove in 0.25 + depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " + "in 0.25.") + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, + n_jobs=n_jobs) + + with pytest.warns(FutureWarning, match=depr_msg): + kmeans.fit(X) + + +def test_warning_elkan_1_cluster(): + X, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=0) + kmeans = KMeans(n_clusters=1, n_init=1, init='random', random_state=0, + algorithm='elkan') + + with pytest.warns(RuntimeWarning, + match="algorithm='elkan' doesn't make sense for a single" + " cluster"): + kmeans.fit(X) + + +def test_error_wrong_algorithm(): + X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) + kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, + 
algorithm='wrong') + + with pytest.raises(ValueError, + match="Algorithm must be 'auto', 'full' or 'elkan'"): + kmeans.fit(X) + + +@pytest.mark.parametrize("array_constr", + [np.array, sp.csr_matrix], + ids=['dense', 'sparse']) +@pytest.mark.parametrize("algo", ['full', 'elkan']) +def test_k_means_1_iteration(array_constr, algo): + # check the results after a single iteration (E-step M-step E-step) by + # comparing against a pure python implementation. + X = np.random.RandomState(0).uniform(size=(100, 5)) + init_centers = X[:5] + X = array_constr(X) + + def py_kmeans(X, init): + new_centers = init.copy() + labels = pairwise_distances_argmin(X, init) + for label in range(init.shape[0]): + new_centers[label] = X[labels == label].mean(axis=0) + labels = pairwise_distances_argmin(X, new_centers) + return labels, new_centers + + py_labels, py_centers = py_kmeans(X, init_centers) + + cy_kmeans = KMeans(n_clusters=5, n_init=1, init=init_centers, + algorithm=algo, max_iter=1).fit(X) + cy_labels = cy_kmeans.labels_ + cy_centers = cy_kmeans.cluster_centers_ + + assert_array_equal(py_labels, cy_labels) + assert_allclose(py_centers, cy_centers) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("squared", [True, False]) +def test_euclidean_distance(dtype, squared): + rng = np.random.RandomState(0) + a_sparse = sp.random(1, 100, density=0.5, format="csr", random_state=rng, + dtype=dtype) + a_dense = a_sparse.toarray().reshape(-1) + b = rng.randn(100).astype(dtype, copy=False) + b_squared_norm = (b**2).sum() + + expected = ((a_dense - b)**2).sum() + expected = expected if squared else np.sqrt(expected) + + distance_dense_dense = _euclidean_dense_dense_wrapper(a_dense, b, squared) + distance_sparse_dense = _euclidean_sparse_dense_wrapper( + a_sparse.data, a_sparse.indices, b, b_squared_norm, squared) + + assert_allclose(distance_dense_dense, distance_sparse_dense, rtol=1e-6) + assert_allclose(distance_dense_dense, expected, rtol=1e-6) + 
assert_allclose(distance_sparse_dense, expected, rtol=1e-6) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_inertia(dtype): + rng = np.random.RandomState(0) + X_sparse = sp.random(100, 10, density=0.5, format="csr", random_state=rng, + dtype=dtype) + X_dense = X_sparse.toarray() + sample_weight = rng.randn(100).astype(dtype, copy=False) + centers = rng.randn(5, 10).astype(dtype, copy=False) + labels = rng.randint(5, size=100, dtype=np.int32) + + distances = ((X_dense - centers[labels])**2).sum(axis=1) + expected = np.sum(distances * sample_weight) + + inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels) + inertia_sparse = _inertia_sparse(X_sparse, sample_weight, centers, labels) + + assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6) + assert_allclose(inertia_dense, expected, rtol=1e-6) + assert_allclose(inertia_sparse, expected, rtol=1e-6) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 4b0ac978c0c69..d8c180dfd40a9 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -51,20 +51,20 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): Parameters ---------- transformers : list of tuples - List of (name, transformer, column(s)) tuples specifying the + List of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data. - name : string + name : str Like in Pipeline and FeatureUnion, this allows the transformer and its parameters to be set using ``set_params`` and searched in grid search. - transformer : estimator or {'passthrough', 'drop'} + transformer : {'drop', 'passthrough'} or estimator Estimator must support :term:`fit` and :term:`transform`. Special-cased strings 'drop' and 'passthrough' are accepted as well, to indicate to drop the columns or to pass them through untransformed, respectively. 
- column(s) : string or int, array-like of string or int, slice, \ -boolean mask array or callable + columns : str, array-like of str, int, array-like of int, \ + array-like of bool, slice or callable Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns by name. A scalar string or int should be used where @@ -74,7 +74,7 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): above. To select multiple columns by name or dtype, you can use :obj:`make_column_transformer`. - remainder : {'drop', 'passthrough'} or estimator, default 'drop' + remainder : {'drop', 'passthrough'} or estimator, default='drop' By default, only the specified columns in `transformers` are transformed and combined in the output, and the non-specified columns are dropped. (default of ``'drop'``). @@ -88,25 +88,25 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): Note that using this feature requires that the DataFrame columns input at :term:`fit` and :term:`transform` have identical order. - sparse_threshold : float, default = 0.3 + sparse_threshold : float, default=0.3 If the output of the different transformers contains sparse matrices, these will be stacked as a sparse matrix if the overall density is lower than this value. Use ``sparse_threshold=0`` to always return dense. When the transformed output consists of all dense data, the stacked result will be dense, and this keyword will be ignored. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - transformer_weights : dict, optional + transformer_weights : dict, default=None Multiplicative weights for features per transformer. The output of the transformer is multiplied by these weights. Keys are transformer names, values the weights. 
- verbose : boolean, optional(default=False) + verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed. @@ -124,12 +124,12 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): ``len(transformers_)==len(transformers)+1``, otherwise ``len(transformers_)==len(transformers)``. - named_transformers_ : Bunch object, a dictionary with attribute access + named_transformers_ : Bunch Read-only attribute to access any transformer by given name. Keys are transformer names and values are the fitted transformer objects. - sparse_output_ : boolean + sparse_output_ : bool Boolean flag indicating whether the output of ``transform`` is a sparse matrix or a dense numpy array, which depends on the output of the individual transformers and the `sparse_threshold` keyword. @@ -206,13 +206,13 @@ def get_params(self, deep=True): Parameters ---------- - deep : boolean, optional + deep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns ------- - params : mapping of string to any + params : dict Parameter names mapped to their values. """ return self._get_params('_transformers', deep=deep) @@ -467,11 +467,11 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like or DataFrame of shape [n_samples, n_features] + X : {array-like, dataframe} of shape (n_samples, n_features) Input data, of which specified subsets are used to fit the transformers. - y : array-like, shape (n_samples, ...), optional + y : array-like of shape (n_samples,...), default=None Targets for supervised learning. Returns @@ -490,16 +490,17 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : array-like or DataFrame of shape [n_samples, n_features] + X : {array-like, dataframe} of shape (n_samples, n_features) Input data, of which specified subsets are used to fit the transformers. 
- y : array-like, shape (n_samples, ...), optional + y : array-like of shape (n_samples,), default=None Targets for supervised learning. Returns ------- - X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) + X_t : {array-like, sparse matrix} of \ + shape (n_samples, sum_n_components) hstack of results of transformers. sum_n_components is the sum of n_components (output dimension) over transformers. If any result is a sparse matrix, everything will be converted to @@ -545,12 +546,13 @@ def transform(self, X): Parameters ---------- - X : array-like or DataFrame of shape [n_samples, n_features] + X : {array-like, dataframe} of shape (n_samples, n_features) The data to be transformed by subset. Returns ------- - X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) + X_t : {array-like, sparse matrix} of \ + shape (n_samples, sum_n_components) hstack of results of transformers. sum_n_components is the sum of n_components (output dimension) over transformers. If any result is a sparse matrix, everything will be converted to @@ -603,7 +605,7 @@ def _hstack(self, Xs): Parameters ---------- - Xs : List of numpy arrays, sparse arrays, or DataFrames + Xs : list of {array-like, sparse matrix, dataframe} """ if self.sparse_output_: try: @@ -670,16 +672,16 @@ def make_column_transformer(*transformers, **kwargs): Parameters ---------- *transformers : tuples - Tuples of the form (transformer, column(s)) specifying the + Tuples of the form (transformer, columns) specifying the transformer objects to be applied to subsets of the data. - transformer : estimator or {'passthrough', 'drop'} + transformer : {'drop', 'passthrough'} or estimator Estimator must support :term:`fit` and :term:`transform`. Special-cased strings 'drop' and 'passthrough' are accepted as well, to indicate to drop the columns or to pass them through untransformed, respectively. 
- column(s) : string or int, array-like of string or int, slice, \ -boolean mask array or callable + columns : str, array-like of str, int, array-like of int, slice, \ + array-like of bool or callable Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns by name. A scalar string or int should be used where @@ -688,7 +690,7 @@ def make_column_transformer(*transformers, **kwargs): A callable is passed the input data `X` and can return any of the above. - remainder : {'drop', 'passthrough'} or estimator, default 'drop' + remainder : {'drop', 'passthrough'} or estimator, default='drop' By default, only the specified columns in `transformers` are transformed and combined in the output, and the non-specified columns are dropped. (default of ``'drop'``). @@ -700,7 +702,7 @@ def make_column_transformer(*transformers, **kwargs): non-specified columns will use the ``remainder`` estimator. The estimator must support :term:`fit` and :term:`transform`. - sparse_threshold : float, default = 0.3 + sparse_threshold : float, default=0.3 If the transformed output consists of a mix of sparse and dense data, it will be stacked as a sparse matrix if the density is lower than this value. Use ``sparse_threshold=0`` to always return dense. @@ -708,13 +710,13 @@ def make_column_transformer(*transformers, **kwargs): the stacked result will be sparse or dense, respectively, and this keyword will be ignored. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - verbose : boolean, optional(default=False) + verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed. 
diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 50a44cdb42b9a..aad8050cb689e 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -42,9 +42,10 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): Parameters ---------- - regressor : object, default=LinearRegression() + regressor : object, default=None Regressor object such as derived from ``RegressorMixin``. This regressor will automatically be cloned each time prior to fitting. + If regressor is ``None``, ``LinearRegression()`` is created and used. transformer : object, default=None Estimator object such as derived from ``TransformerMixin``. Cannot be @@ -54,13 +55,13 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): transformer will be cloned during fitting. Also, the transformer is restricting ``y`` to be a numpy array. - func : function, optional + func : function, default=None Function to apply to ``y`` before passing to ``fit``. Cannot be set at the same time as ``transformer``. The function needs to return a 2-dimensional array. If ``func`` is ``None``, the function used will be the identity function. - inverse_func : function, optional + inverse_func : function, default=None Function to apply to the prediction of the regressor. Cannot be set at the same time as ``transformer`` as well. The function needs to return a 2-dimensional array. The inverse function is used to return @@ -153,14 +154,14 @@ def fit(self, X, y, **fit_params): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,) + y : array-like of shape (n_samples,) Target values. - **fit_params : dict of string -> object + **fit_params : dict Parameters passed to the ``fit`` method of the underlying regressor. 
@@ -215,7 +216,7 @@ def predict(self, X): Returns ------- - y_hat : array, shape = (n_samples,) + y_hat : ndarray of shape (n_samples,) Predicted values. """ diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 98aecc4a43db8..da1bd0dddf529 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -230,6 +230,14 @@ def transform(self, X): return U + def _more_tags(self): + return { + '_xfail_test': { + "check_methods_subset_invariance": + "fails for the transform method" + } + } + class MiniBatchSparsePCA(SparsePCA): """Mini-batch Sparse Principal Components Analysis diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 369214757a858..da986f900ab8e 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -11,7 +11,6 @@ import warnings import numpy as np -from .exceptions import ChangedBehaviorWarning from scipy import linalg from scipy.special import expit @@ -24,6 +23,7 @@ from .utils.multiclass import check_classification_targets from .utils.extmath import softmax from .preprocessing import StandardScaler +from .utils.validation import _deprecate_positional_args __all__ = ['LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis'] @@ -246,8 +246,8 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, >>> print(clf.predict([[-0.8, -1]])) [1] """ - - def __init__(self, solver='svd', shrinkage=None, priors=None, + @_deprecate_positional_args + def __init__(self, *, solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=1e-4): self.solver = solver self.shrinkage = shrinkage @@ -618,8 +618,8 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): sklearn.discriminant_analysis.LinearDiscriminantAnalysis: Linear Discriminant Analysis """ - - def __init__(self, priors=None, reg_param=0., store_covariance=False, + @_deprecate_positional_args + def 
__init__(self, *, priors=None, reg_param=0., store_covariance=False, tol=1.0e-4): self.priors = np.asarray(priors) if priors is not None else None self.reg_param = reg_param diff --git a/sklearn/dummy.py b/sklearn/dummy.py index a2b06d1985c94..6fb9b21711930 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -18,7 +18,7 @@ from .utils.stats import _weighted_percentile from .utils.multiclass import class_distribution from .utils import deprecated - +from .utils.validation import _deprecate_positional_args class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): """ @@ -98,8 +98,8 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): >>> dummy_clf.score(X, y) 0.75 """ - - def __init__(self, strategy="warn", random_state=None, + @_deprecate_positional_args + def __init__(self, *, strategy="warn", random_state=None, constant=None): self.strategy = strategy self.random_state = random_state @@ -354,7 +354,13 @@ def predict_log_proba(self, X): return [np.log(p) for p in proba] def _more_tags(self): - return {'poor_score': True, 'no_validation': True} + return { + 'poor_score': True, 'no_validation': True, + '_xfail_test': { + 'check_methods_subset_invariance': + 'fails for the predict method' + } + } def score(self, X, y, sample_weight=None): """Returns the mean accuracy on the given test data and labels. 
@@ -453,8 +459,8 @@ class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): >>> dummy_regr.score(X, y) 0.0 """ - - def __init__(self, strategy="mean", constant=None, quantile=None): + @_deprecate_positional_args + def __init__(self, *, strategy="mean", constant=None, quantile=None): self.strategy = strategy self.constant = constant self.quantile = quantile diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index f9dd56d833d7a..8533c84ef5e88 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -82,7 +82,7 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, print("Building estimator %d of %d for this parallel run " "(total %d)..." % (i + 1, n_estimators, total_n_estimators)) - random_state = np.random.RandomState(seeds[i]) + random_state = seeds[i] estimator = ensemble._make_estimator(append=False, random_state=random_state) @@ -405,9 +405,8 @@ def _get_estimators_indices(self): for seed in self._seeds: # Operations accessing random_state must be performed identically # to those in `_parallel_build_estimators()` - random_state = np.random.RandomState(seed) feature_indices, sample_indices = _generate_bagging_indices( - random_state, self.bootstrap_features, self.bootstrap, + seed, self.bootstrap_features, self.bootstrap, self.n_features_, self._n_samples, self._max_features, self._max_samples) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 0e69c0c8d14be..883f0067f5e78 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -878,3 +878,25 @@ def test_bagging_small_max_features(): bagging = BaggingClassifier(LogisticRegression(), max_features=0.3, random_state=1) bagging.fit(X, y) + + +def test_bagging_get_estimators_indices(): + # Check that Bagging estimator can generate sample indices properly + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16436 
+ + rng = np.random.RandomState(0) + X = rng.randn(13, 4) + y = np.arange(13) + + class MyEstimator(DecisionTreeRegressor): + """An estimator which stores y indices information at fit.""" + def fit(self, X, y): + self._sample_indices = y + + clf = BaggingRegressor(base_estimator=MyEstimator(), + n_estimators=1, random_state=0) + clf.fit(X, y) + + assert_array_equal(clf.estimators_[0]._sample_indices, + clf.estimators_samples_[0]) diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 600c29eafafae..111f0512216f7 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -6,12 +6,13 @@ import numpy as np from scipy import interpolate from scipy.stats import spearmanr +import warnings +import math + from .base import BaseEstimator, TransformerMixin, RegressorMixin from .utils import check_array, check_consistent_length -from .utils.validation import _check_sample_weight +from .utils.validation import _check_sample_weight, _deprecate_positional_args from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique -import warnings -import math __all__ = ['check_increasing', 'isotonic_regression', @@ -198,7 +199,8 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): >>> iso_reg.predict([.1, .2]) array([1.8628..., 3.7256...]) """ - def __init__(self, y_min=None, y_max=None, increasing=True, + @_deprecate_positional_args + def __init__(self, *, y_min=None, y_max=None, increasing=True, out_of_bounds='nan'): self.y_min = y_min self.y_max = y_max diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index aa94e83fc492d..f15bf508f8dad 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -19,7 +19,7 @@ from .utils.extmath import safe_sparse_dot from .utils.validation import check_is_fitted from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS -from .utils.validation import check_non_negative +from .utils.validation import check_non_negative, 
_deprecate_positional_args class RBFSampler(TransformerMixin, BaseEstimator): @@ -81,8 +81,8 @@ class RBFSampler(TransformerMixin, BaseEstimator): Benjamin Recht. (https://people.eecs.berkeley.edu/~brecht/papers/08.rah.rec.nips.pdf) """ - - def __init__(self, gamma=1., n_components=100, random_state=None): + @_deprecate_positional_args + def __init__(self, *, gamma=1., n_components=100, random_state=None): self.gamma = gamma self.n_components = n_components self.random_state = random_state @@ -187,8 +187,8 @@ class SkewedChi2Sampler(TransformerMixin, BaseEstimator): sklearn.metrics.pairwise.chi2_kernel : The exact chi squared kernel. """ - - def __init__(self, skewedness=1., n_components=100, random_state=None): + @_deprecate_positional_args + def __init__(self, *, skewedness=1., n_components=100, random_state=None): self.skewedness = skewedness self.n_components = n_components self.random_state = random_state @@ -318,8 +318,8 @@ class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): A. Vedaldi and A. Zisserman, Pattern Analysis and Machine Intelligence, 2011 """ - - def __init__(self, sample_steps=2, sample_interval=None): + @_deprecate_positional_args + def __init__(self, *, sample_steps=2, sample_interval=None): self.sample_steps = sample_steps self.sample_interval = sample_interval @@ -534,8 +534,8 @@ class Nystroem(TransformerMixin, BaseEstimator): sklearn.metrics.pairwise.kernel_metrics : List of built-in kernels. 
""" - - def __init__(self, kernel="rbf", gamma=None, coef0=None, degree=None, + @_deprecate_positional_args + def __init__(self, kernel="rbf", *, gamma=None, coef0=None, degree=None, kernel_params=None, n_components=100, random_state=None): self.kernel = kernel self.gamma = gamma diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index aa8c100718b74..421cbd800f38a 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -229,6 +229,8 @@ def mean_absolute_percentage_error(y_true, y_pred, weighted average of all output errors is returned. MAPE output is non-negative floating point. The best value is 0.0. + But note the fact that bad predictions can lead to arbitarily large + MAPE values, especially if some y_true values are very close to zero. Examples -------- diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 9ec4ccf2e848f..36097720acdfd 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -623,8 +623,9 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, greater_is_better=False) neg_mean_absolute_error_scorer = make_scorer(mean_absolute_error, greater_is_better=False) -neg_mape_scorer = make_scorer(mean_absolute_percentage_error, - greater_is_better=False) +neg_mean_absolute_percentage_error_scorer = make_scorer( + mean_absolute_percentage_error, greater_is_better=False +) neg_median_absolute_error_scorer = make_scorer(median_absolute_error, greater_is_better=False) neg_root_mean_squared_error_scorer = make_scorer(mean_squared_error, @@ -684,35 +685,35 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, fowlkes_mallows_scorer = make_scorer(fowlkes_mallows_score) -SCORERS = dict(explained_variance=explained_variance_scorer, - r2=r2_scorer, - max_error=max_error_scorer, - neg_median_absolute_error=neg_median_absolute_error_scorer, - neg_mean_absolute_error=neg_mean_absolute_error_scorer, - 
neg_mean_absolute_percentage_error=neg_mape_scorer, - neg_mean_squared_error=neg_mean_squared_error_scorer, - neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, - neg_root_mean_squared_error=neg_root_mean_squared_error_scorer, - neg_mean_poisson_deviance=neg_mean_poisson_deviance_scorer, - neg_mean_gamma_deviance=neg_mean_gamma_deviance_scorer, - accuracy=accuracy_scorer, roc_auc=roc_auc_scorer, - roc_auc_ovr=roc_auc_ovr_scorer, - roc_auc_ovo=roc_auc_ovo_scorer, - roc_auc_ovr_weighted=roc_auc_ovr_weighted_scorer, - roc_auc_ovo_weighted=roc_auc_ovo_weighted_scorer, - balanced_accuracy=balanced_accuracy_scorer, - average_precision=average_precision_scorer, - neg_log_loss=neg_log_loss_scorer, - neg_brier_score=neg_brier_score_scorer, - # Cluster metrics that use supervised evaluation - adjusted_rand_score=adjusted_rand_scorer, - homogeneity_score=homogeneity_scorer, - completeness_score=completeness_scorer, - v_measure_score=v_measure_scorer, - mutual_info_score=mutual_info_scorer, - adjusted_mutual_info_score=adjusted_mutual_info_scorer, - normalized_mutual_info_score=normalized_mutual_info_scorer, - fowlkes_mallows_score=fowlkes_mallows_scorer) +SCORERS = dict( + explained_variance=explained_variance_scorer, + max_error=max_error_scorer, + neg_median_absolute_error=neg_median_absolute_error_scorer, + neg_mean_absolute_error=neg_mean_absolute_error_scorer, + neg_mean_absolute_percentage_error=neg_mean_absolute_percentage_error_scorer, + neg_mean_squared_error=neg_mean_squared_error_scorer, + neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, + neg_root_mean_squared_error=neg_root_mean_squared_error_scorer, + neg_mean_poisson_deviance=neg_mean_poisson_deviance_scorer, + neg_mean_gamma_deviance=neg_mean_gamma_deviance_scorer, + accuracy=accuracy_scorer, roc_auc=roc_auc_scorer, + roc_auc_ovr=roc_auc_ovr_scorer, + roc_auc_ovo=roc_auc_ovo_scorer, + roc_auc_ovr_weighted=roc_auc_ovr_weighted_scorer, + roc_auc_ovo_weighted=roc_auc_ovo_weighted_scorer, + 
balanced_accuracy=balanced_accuracy_scorer, + average_precision=average_precision_scorer, + neg_log_loss=neg_log_loss_scorer, + neg_brier_score=neg_brier_score_scorer, + # Cluster metrics that use supervised evaluation + adjusted_rand_score=adjusted_rand_scorer, + homogeneity_score=homogeneity_scorer, + completeness_score=completeness_scorer, + v_measure_score=v_measure_scorer, + mutual_info_score=mutual_info_scorer, + adjusted_mutual_info_score=adjusted_mutual_info_scorer, + normalized_mutual_info_score=normalized_mutual_info_scorer, + fowlkes_mallows_score=fowlkes_mallows_scorer) for name, metric in [('precision', precision_score), diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 9eb0c42227ad1..3f2ba83b474c7 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1376,6 +1376,10 @@ def test_thresholded_multilabel_multioutput_permutations_invariance(name): if metric == mean_absolute_percentage_error: assert np.isfinite(current_score) assert current_score > 1e6 + # Here we are not comparing the values in case of MAPE because + # whenever y_true value is exactly zero, the MAPE value doesn't + # signify anything. Thus, in this case we are just expecting + # very large finite value. 
else: assert_almost_equal(score, current_score) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 573f66c36511c..33a8b0f30cfd6 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -323,3 +323,10 @@ def test_tweedie_deviance_continuity(): assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=2 + 1e-10), mean_tweedie_deviance(y_true, y_pred, power=2), atol=1e-6) + + +def test_mean_absolute_percentage_error(): + random_number_generator = np.random.RandomState(42) + y_true = random_number_generator.exponential(size=100) + y_pred = 1.2 * y_true + assert mean_absolute_percentage_error(y_true, y_pred) == pytest.approx(20.) diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index da31859477122..38673eb4ab8bd 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -372,3 +372,11 @@ def fit(self, X, y=None): begin = end return self + + def _more_tags(self): + return { + '_xfail_test': { + 'check_methods_subset_invariance': + 'fails for the decision_function method' + } + } diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 0750e5a4c1ae2..1d96900555400 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -828,6 +828,15 @@ def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='scale', break_ties=break_ties, random_state=random_state) + def _more_tags(self): + return { + '_xfail_test': { + 'check_methods_subset_invariance': + 'fails for the decision_function method', + 'check_class_weight_classifiers': 'class_weight is ignored.' + } + } + class SVR(RegressorMixin, BaseLibSVM): """Epsilon-Support Vector Regression. 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index b9fa122e2c888..508356155aaf7 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -20,7 +20,7 @@ from sklearn.utils import all_estimators from sklearn.utils._testing import ignore_warnings from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.estimator_checks import check_estimator +from sklearn.utils.estimator_checks import check_estimator, _safe_tags import sklearn from sklearn.base import BiclusterMixin @@ -87,12 +87,19 @@ def _tested_estimators(): @parametrize_with_checks(_tested_estimators()) -def test_estimators(estimator, check): +def test_estimators(estimator, check, request): # Common tests for estimator instances with ignore_warnings(category=(FutureWarning, ConvergenceWarning, UserWarning, FutureWarning)): _set_checking_parameters(estimator) + + xfail_checks = _safe_tags(estimator, '_xfail_test') + check_name = _set_check_estimator_ids(check) + if xfail_checks: + if check_name in xfail_checks: + msg = xfail_checks[check_name] + request.applymarker(pytest.mark.xfail(reason=msg)) check(estimator) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 427752b5e8a4a..d01666366a283 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1668,7 +1668,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): >>> reg = BaggingRegressor(extra_tree, random_state=0).fit( ... X_train, y_train) >>> reg.score(X_test, y_test) - 0.7788... + 0.7447... 
""" def __init__(self, criterion="mse", diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 8dcfff68f7cad..ef9a4b1ca17f6 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1032,13 +1032,6 @@ def check_methods_subset_invariance(name, estimator_orig): msg = ("{method} of {name} is not invariant when applied " "to a subset.").format(method=method, name=name) - # TODO remove cases when corrected - if (name, method) in [('NuSVC', 'decision_function'), - ('SparsePCA', 'transform'), - ('MiniBatchSparsePCA', 'transform'), - ('DummyClassifier', 'predict'), - ('BernoulliRBM', 'score_samples')]: - raise SkipTest(msg) if hasattr(estimator, method): result_full, result_by_batch = _apply_on_subsets( @@ -2243,13 +2236,6 @@ def check_regressors_no_decision_function(name, regressor_orig): @ignore_warnings(category=FutureWarning) def check_class_weight_classifiers(name, classifier_orig): - if name == "NuSVC": - # the sparse version has a parameter that doesn't do anything - raise SkipTest("Not testing NuSVC class weight as it is ignored.") - if name.endswith("NB"): - # NaiveBayes classifiers have a somewhat different interface. - # FIXME SOON! 
- raise SkipTest if _safe_tags(classifier_orig, 'binary_only'): problems = [2] diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 7fdd9168a50e3..debbbebbfe204 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -25,6 +25,7 @@ ctypedef fused integral: ctypedef np.float64_t DOUBLE + def csr_row_norms(X): """L2 norm of each row in CSR matrix X.""" if X.dtype not in [np.float32, np.float64]: @@ -38,19 +39,18 @@ def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, np.ndarray[integral, ndim=1, mode="c"] X_indptr): cdef: unsigned long long n_samples = shape[0] - unsigned long long n_features = shape[1] - np.ndarray[DOUBLE, ndim=1, mode="c"] norms - - np.npy_intp i, j + unsigned long long i + integral j double sum_ - norms = np.zeros(n_samples, dtype=np.float64) + norms = np.empty(n_samples, dtype=X_data.dtype) + cdef floating[::1] norms_view = norms for i in range(n_samples): sum_ = 0.0 for j in range(X_indptr[i], X_indptr[i + 1]): sum_ += X_data[j] * X_data[j] - norms[i] = sum_ + norms_view[i] = sum_ return norms diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 443a4a3229311..2891079bc3e32 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -16,7 +16,8 @@ count_nonzero, csc_median_axis_0) from sklearn.utils.sparsefuncs_fast import (assign_rows_csr, inplace_csr_row_normalize_l1, - inplace_csr_row_normalize_l2) + inplace_csr_row_normalize_l2, + csr_row_norms) from sklearn.utils._testing import assert_allclose @@ -544,3 +545,16 @@ def test_inplace_normalize(): if inplace_csr_row_normalize is inplace_csr_row_normalize_l2: X_csr.data **= 2 assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_csr_row_norms(dtype): + # checks that csr_row_norms returns the same output as + # scipy.sparse.linalg.norm, and 
that the dype is the same as X.dtype. + X = sp.random(100, 10, format='csr', dtype=dtype) + + scipy_norms = sp.linalg.norm(X, axis=1)**2 + norms = csr_row_norms(X) + + assert norms.dtype == dtype + assert_allclose(norms, scipy_norms) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index ff9d8b169cb0c..08952d6cbcd16 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1290,7 +1290,7 @@ def inner_f(*args, **kwargs): args_msg = ['{}={}'.format(name, arg) for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:])] - warnings.warn("Pass {} as keyword args. From version 0.24 " + warnings.warn("Pass {} as keyword args. From version 0.25 " "passing these as positional arguments will " "result in an error".format(", ".join(args_msg)), FutureWarning) From c6c7ba9545158aeaeb5a867ad3f44f320f1d65f1 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 21 Feb 2020 11:48:53 +0530 Subject: [PATCH 059/103] Resolving r2_scorer object error --- sklearn/metrics/_scorer.py | 1 + sklearn/metrics/tests/test_score_objects.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 36097720acdfd..cb52e1f8b0821 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -688,6 +688,7 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, SCORERS = dict( explained_variance=explained_variance_scorer, max_error=max_error_scorer, + r2_score=r2_scorer, neg_median_absolute_error=neg_median_absolute_error_scorer, neg_mean_absolute_error=neg_mean_absolute_error_scorer, neg_mean_absolute_percentage_error=neg_mean_absolute_percentage_error_scorer, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index ccc32723302ec..31ad2698dc7ca 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -41,7 +41,7 @@ from sklearn.multiclass 
import OneVsRestClassifier -REGRESSION_SCORERS = ['explained_variance', 'r2', +REGRESSION_SCORERS = ['explained_variance', 'r2_score', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_log_error', @@ -350,7 +350,7 @@ def test_regression_scorers(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = Ridge() clf.fit(X_train, y_train) - score1 = get_scorer('r2')(clf, X_test, y_test) + score1 = get_scorer('r2_score')(clf, X_test, y_test) score2 = r2_score(y_test, clf.predict(X_test)) assert_almost_equal(score1, score2) From 2a59c178fc07a5e9501733d08b4a117b821f811c Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 21 Feb 2020 12:09:13 +0530 Subject: [PATCH 060/103] Resolving r2_scorer object error --- sklearn/model_selection/tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 67b66b6a91431..7983cc9a6a4f5 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -371,7 +371,7 @@ def test_cross_validate(): for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)): # It's okay to evaluate regression metrics on classification too mse_scorer = check_scoring(est, 'neg_mean_squared_error') - r2_scorer = check_scoring(est, 'r2') + r2_scorer = check_scoring(est, 'r2_score') train_mse_scores = [] test_mse_scores = [] train_r2_scores = [] @@ -661,7 +661,7 @@ def test_cross_val_score_with_score_func_regression(): # R2 score (aka. 
determination coefficient) - should be the # same as the default estimator score - r2_scores = cross_val_score(reg, X, y, scoring="r2") + r2_scores = cross_val_score(reg, X, y, scoring="r2_score") assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) # Mean squared error; this is a loss function, so "scores" are negative From 67f857ef6bee5edd1f981e1a324ed794e46a25ba Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 21 Feb 2020 12:54:07 +0530 Subject: [PATCH 061/103] Resolving r2_scorer object error --- sklearn/model_selection/tests/test_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 7983cc9a6a4f5..c996bf30ee338 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -422,12 +422,12 @@ def check_cross_validate_single_metric(clf, X, y, scores): # Single metric passed as a list if return_train_score: # It must be True by default - deprecated - r2_scores_dict = cross_validate(clf, X, y, scoring=['r2'], + r2_scores_dict = cross_validate(clf, X, y, scoring=['r2_score'], return_train_score=True) assert_array_almost_equal(r2_scores_dict['train_r2'], train_r2_scores, True) else: - r2_scores_dict = cross_validate(clf, X, y, scoring=['r2'], + r2_scores_dict = cross_validate(clf, X, y, scoring=['r2_score'], return_train_score=False) assert isinstance(r2_scores_dict, dict) assert len(r2_scores_dict) == dict_len @@ -446,8 +446,8 @@ def check_cross_validate_multi_metric(clf, X, y, scores): # Test multimetric evaluation when scoring is a list / dict (train_mse_scores, test_mse_scores, train_r2_scores, test_r2_scores, fitted_estimators) = scores - all_scoring = (('r2', 'neg_mean_squared_error'), - {'r2': make_scorer(r2_score), + all_scoring = (('r2_score', 'neg_mean_squared_error'), + {'r2_score': make_scorer(r2_score), 'neg_mean_squared_error': 
'neg_mean_squared_error'}) keys_sans_train = {'test_r2', 'test_neg_mean_squared_error', From cb0a63507a7d5cbdfb39d73b0144f9fc961afd17 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Fri, 21 Feb 2020 13:46:48 +0530 Subject: [PATCH 062/103] Resolving r2_scorer object error --- sklearn/metrics/_scorer.py | 2 +- sklearn/metrics/tests/test_score_objects.py | 4 ++-- sklearn/model_selection/tests/test_validation.py | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index cb52e1f8b0821..72f059cf07a7d 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -688,7 +688,7 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, SCORERS = dict( explained_variance=explained_variance_scorer, max_error=max_error_scorer, - r2_score=r2_scorer, + r2=r2_scorer, neg_median_absolute_error=neg_median_absolute_error_scorer, neg_mean_absolute_error=neg_mean_absolute_error_scorer, neg_mean_absolute_percentage_error=neg_mean_absolute_percentage_error_scorer, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 31ad2698dc7ca..ccc32723302ec 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -41,7 +41,7 @@ from sklearn.multiclass import OneVsRestClassifier -REGRESSION_SCORERS = ['explained_variance', 'r2_score', +REGRESSION_SCORERS = ['explained_variance', 'r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_log_error', @@ -350,7 +350,7 @@ def test_regression_scorers(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = Ridge() clf.fit(X_train, y_train) - score1 = get_scorer('r2_score')(clf, X_test, y_test) + score1 = get_scorer('r2')(clf, X_test, y_test) score2 = r2_score(y_test, clf.predict(X_test)) assert_almost_equal(score1, score2) diff --git 
a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index c996bf30ee338..67b66b6a91431 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -371,7 +371,7 @@ def test_cross_validate(): for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)): # It's okay to evaluate regression metrics on classification too mse_scorer = check_scoring(est, 'neg_mean_squared_error') - r2_scorer = check_scoring(est, 'r2_score') + r2_scorer = check_scoring(est, 'r2') train_mse_scores = [] test_mse_scores = [] train_r2_scores = [] @@ -422,12 +422,12 @@ def check_cross_validate_single_metric(clf, X, y, scores): # Single metric passed as a list if return_train_score: # It must be True by default - deprecated - r2_scores_dict = cross_validate(clf, X, y, scoring=['r2_score'], + r2_scores_dict = cross_validate(clf, X, y, scoring=['r2'], return_train_score=True) assert_array_almost_equal(r2_scores_dict['train_r2'], train_r2_scores, True) else: - r2_scores_dict = cross_validate(clf, X, y, scoring=['r2_score'], + r2_scores_dict = cross_validate(clf, X, y, scoring=['r2'], return_train_score=False) assert isinstance(r2_scores_dict, dict) assert len(r2_scores_dict) == dict_len @@ -446,8 +446,8 @@ def check_cross_validate_multi_metric(clf, X, y, scores): # Test multimetric evaluation when scoring is a list / dict (train_mse_scores, test_mse_scores, train_r2_scores, test_r2_scores, fitted_estimators) = scores - all_scoring = (('r2_score', 'neg_mean_squared_error'), - {'r2_score': make_scorer(r2_score), + all_scoring = (('r2', 'neg_mean_squared_error'), + {'r2': make_scorer(r2_score), 'neg_mean_squared_error': 'neg_mean_squared_error'}) keys_sans_train = {'test_r2', 'test_neg_mean_squared_error', @@ -661,7 +661,7 @@ def test_cross_val_score_with_score_func_regression(): # R2 score (aka. 
determination coefficient) - should be the # same as the default estimator score - r2_scores = cross_val_score(reg, X, y, scoring="r2_score") + r2_scores = cross_val_score(reg, X, y, scoring="r2") assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) # Mean squared error; this is a loss function, so "scores" are negative From a03affef33838e8c402c7d2697ee9cc0700d82eb Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Mon, 24 Feb 2020 14:01:26 +0530 Subject: [PATCH 063/103] Added changes of conflict --- doc/whats_new/v0.23.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index eda4d14f7aad2..feaab78181336 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -259,17 +259,15 @@ Changelog :mod:`sklearn.metrics` ...................... -<<<<<<< HEAD - |Feature| Added :func:`metrics.mean_absolute_percentage_error` metric and the associated scorer for regression problems. :issue:`10708` fixed with the PR :pr:`15007` by :user:`Ashutosh Hathidara `. The scorer and some practical test cases were taken from PR :pr:`10711` by :user:`Mohamed Ali Jamaoui `. -======= + - |Enhancement| :func:`metrics.pairwise.pairwise_distances_chunked` now allows its ``reduce_func`` to not have a return value, enabling in-place operations. :pr:`16397` by `Joel Nothman`_. ->>>>>>> upstream/master - |Fix| Fixed a bug in :func:`metrics.mean_squared_error` to not ignore argument `squared` when argument `multioutput='raw_values'`. 
From f9fe4a42c5ab4a54d89bd09a2c75e86c6c5bedb4 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Tue, 25 Feb 2020 12:44:45 +0530 Subject: [PATCH 064/103] Update sklearn/metrics/_regression.py Co-Authored-By: Thomas J Fan --- sklearn/metrics/_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 421cbd800f38a..9e162a47e5ea8 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -200,7 +200,7 @@ def mean_absolute_percentage_error(y_true, y_pred, Parameters ---------- - y_true : array-like of shape = (n_samples,) or (n_samples, n_outputs) + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) Ground truth (correct) target values. y_pred : array-like of shape = (n_samples,) or (n_samples, n_outputs) From 723f116a0bcd880d35cd0c0c7517cc6881a6dd6d Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Tue, 25 Feb 2020 12:45:08 +0530 Subject: [PATCH 065/103] Update sklearn/metrics/_regression.py Co-Authored-By: Thomas J Fan --- sklearn/metrics/_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 9e162a47e5ea8..7361ce4a697f3 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -203,7 +203,7 @@ def mean_absolute_percentage_error(y_true, y_pred, y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) Ground truth (correct) target values. - y_pred : array-like of shape = (n_samples,) or (n_samples, n_outputs) + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) Estimated target values. 
sample_weight : array-like of shape = (n_samples,), optional From 156e6336f7a92235f3cdd302149a51d4cca4c5a1 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Tue, 25 Feb 2020 12:45:32 +0530 Subject: [PATCH 066/103] Update sklearn/metrics/_regression.py Co-Authored-By: Thomas J Fan --- sklearn/metrics/_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 7361ce4a697f3..9b1fe6e0e1b76 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -206,7 +206,7 @@ def mean_absolute_percentage_error(y_true, y_pred, y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) Estimated target values. - sample_weight : array-like of shape = (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. multioutput : string in ['raw_values', 'uniform_average'] or list. From 92af06b86a53dfe2d917fb283f1ad790c14d8aaf Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Tue, 25 Feb 2020 12:45:45 +0530 Subject: [PATCH 067/103] Update sklearn/metrics/_regression.py Co-Authored-By: Thomas J Fan --- sklearn/metrics/_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 9b1fe6e0e1b76..b626da5414e51 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -209,7 +209,7 @@ def mean_absolute_percentage_error(y_true, y_pred, sample_weight : array-like of shape (n_samples,), default=None Sample weights. - multioutput : string in ['raw_values', 'uniform_average'] or list. + multioutput : {'raw_values', 'uniform_average'} or array-like Defines aggregating of multiple output values. Array-like value defines weights used to average errors. If input is list then the shape must be (n_outputs,). 
From dd709a9f008ee5b5050ea0abdeaeab963dc0a5c5 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 27 Feb 2020 18:38:38 +0530 Subject: [PATCH 068/103] Modified Files to optimize changes --- doc/modules/model_evaluation.rst | 2 +- doc/whats_new/v0.23.rst | 3 +- sklearn/metrics/_scorer.py | 59 ++++++++++++++++---------------- 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 8864151afd9eb..6b0bfd7381680 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1973,7 +1973,7 @@ error (MAPE) estimated over :math:`n_{\text{samples}}` is defined as .. math:: - \text{MAPE}(y, \hat{y}) = \frac{\mid y - \hat{y} \mid}{\text{MAX}(\epsilon, \mid y \mid)} + \text{MAPE}(y, \hat{y}) = \frac{1}{n_{\text{samples}}} \sum_{i=0}^{n_{\text{samples}}-1} \frac{{}\left| y_i - \hat{y}_i \right|}{max(\epsilon, \left| y_i \right|)} where :math:`\epsilon` is an arbitrary small yet strictly positive number to avoid undefined results when y is zero. diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index feaab78181336..cd283d6393bd4 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -261,8 +261,7 @@ Changelog - |Feature| Added :func:`metrics.mean_absolute_percentage_error` metric and the associated scorer for regression problems. :issue:`10708` fixed with the - PR :pr:`15007` by :user:`Ashutosh Hathidara `. The scorer and - some practical test cases were taken from PR :pr:`10711` by + PR :pr:`15007` by :user:`Ashutosh Hathidara ` and :user:`Mohamed Ali Jamaoui `. 
- |Enhancement| :func:`metrics.pairwise.pairwise_distances_chunked` now allows diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 72f059cf07a7d..9b5d49c242946 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -685,36 +685,35 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, fowlkes_mallows_scorer = make_scorer(fowlkes_mallows_score) -SCORERS = dict( - explained_variance=explained_variance_scorer, - max_error=max_error_scorer, - r2=r2_scorer, - neg_median_absolute_error=neg_median_absolute_error_scorer, - neg_mean_absolute_error=neg_mean_absolute_error_scorer, - neg_mean_absolute_percentage_error=neg_mean_absolute_percentage_error_scorer, - neg_mean_squared_error=neg_mean_squared_error_scorer, - neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, - neg_root_mean_squared_error=neg_root_mean_squared_error_scorer, - neg_mean_poisson_deviance=neg_mean_poisson_deviance_scorer, - neg_mean_gamma_deviance=neg_mean_gamma_deviance_scorer, - accuracy=accuracy_scorer, roc_auc=roc_auc_scorer, - roc_auc_ovr=roc_auc_ovr_scorer, - roc_auc_ovo=roc_auc_ovo_scorer, - roc_auc_ovr_weighted=roc_auc_ovr_weighted_scorer, - roc_auc_ovo_weighted=roc_auc_ovo_weighted_scorer, - balanced_accuracy=balanced_accuracy_scorer, - average_precision=average_precision_scorer, - neg_log_loss=neg_log_loss_scorer, - neg_brier_score=neg_brier_score_scorer, - # Cluster metrics that use supervised evaluation - adjusted_rand_score=adjusted_rand_scorer, - homogeneity_score=homogeneity_scorer, - completeness_score=completeness_scorer, - v_measure_score=v_measure_scorer, - mutual_info_score=mutual_info_scorer, - adjusted_mutual_info_score=adjusted_mutual_info_scorer, - normalized_mutual_info_score=normalized_mutual_info_scorer, - fowlkes_mallows_score=fowlkes_mallows_scorer) +SCORERS = dict(explained_variance=explained_variance_scorer, + max_error=max_error_scorer, + r2=r2_scorer, + 
neg_median_absolute_error=neg_median_absolute_error_scorer, + neg_mean_absolute_error=neg_mean_absolute_error_scorer, + neg_mean_absolute_percentage_error=neg_mean_absolute_percentage_error_scorer, # noqa + neg_mean_squared_error=neg_mean_squared_error_scorer, + neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, + neg_root_mean_squared_error=neg_root_mean_squared_error_scorer, + neg_mean_poisson_deviance=neg_mean_poisson_deviance_scorer, + neg_mean_gamma_deviance=neg_mean_gamma_deviance_scorer, + accuracy=accuracy_scorer, roc_auc=roc_auc_scorer, + roc_auc_ovr=roc_auc_ovr_scorer, + roc_auc_ovo=roc_auc_ovo_scorer, + roc_auc_ovr_weighted=roc_auc_ovr_weighted_scorer, + roc_auc_ovo_weighted=roc_auc_ovo_weighted_scorer, + balanced_accuracy=balanced_accuracy_scorer, + average_precision=average_precision_scorer, + neg_log_loss=neg_log_loss_scorer, + neg_brier_score=neg_brier_score_scorer, + # Cluster metrics that use supervised evaluation + adjusted_rand_score=adjusted_rand_scorer, + homogeneity_score=homogeneity_scorer, + completeness_score=completeness_scorer, + v_measure_score=v_measure_scorer, + mutual_info_score=mutual_info_scorer, + adjusted_mutual_info_score=adjusted_mutual_info_scorer, + normalized_mutual_info_score=normalized_mutual_info_scorer, + fowlkes_mallows_score=fowlkes_mallows_scorer) for name, metric in [('precision', precision_score), From ea99cffb434aaebdaf54c750810159b8caa4ebb9 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 27 Feb 2020 18:44:31 +0530 Subject: [PATCH 069/103] Modified Files to optimize changes --- sklearn/metrics/_scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 9b5d49c242946..633a1d0c79d1d 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -686,8 +686,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, SCORERS = dict(explained_variance=explained_variance_scorer, - 
max_error=max_error_scorer, r2=r2_scorer, + max_error=max_error_scorer, neg_median_absolute_error=neg_median_absolute_error_scorer, neg_mean_absolute_error=neg_mean_absolute_error_scorer, neg_mean_absolute_percentage_error=neg_mean_absolute_percentage_error_scorer, # noqa From 2c064030b100be1e94962fd6bf3fa5e12ce4fdef Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 27 Feb 2020 18:59:30 +0530 Subject: [PATCH 070/103] Changed description of contributors info --- doc/whats_new/v0.23.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 3e5e09bf7dd77..414c566a82e76 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -269,7 +269,8 @@ Changelog - |Feature| Added :func:`metrics.mean_absolute_percentage_error` metric and the associated scorer for regression problems. :issue:`10708` fixed with the - PR :pr:`15007` by :user:`Ashutosh Hathidara ` and + PR :pr:`15007` by :user:`Ashutosh Hathidara `. The scorer and + some practical test cases were taken from PR :pr:`10711` by :user:`Mohamed Ali Jamaoui `. - |Enhancement| :func:`metrics.pairwise.pairwise_distances_chunked` now allows From 291720ea0a76a3b101ea4bd7c7c9694dc4a74282 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 4 Mar 2020 13:49:47 +0100 Subject: [PATCH 071/103] DOC improve diabetes dataset description (#16534) --- sklearn/datasets/descr/diabetes.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/datasets/descr/diabetes.rst b/sklearn/datasets/descr/diabetes.rst index f2adc8d192b6c..771b3e5fe282a 100644 --- a/sklearn/datasets/descr/diabetes.rst +++ b/sklearn/datasets/descr/diabetes.rst @@ -17,16 +17,16 @@ quantitative measure of disease progression one year after baseline. 
:Target: Column 11 is a quantitative measure of disease progression one year after baseline :Attribute Information: - - Age - - Sex - - Body mass index - - Average blood pressure - - S1 - - S2 - - S3 - - S4 - - S5 - - S6 + - age age in years + - sex + - bmi body mass index + - bp average blood pressure + - s1 tc, T-Cells (a type of white blood cells) + - s2 ldl, low-density lipoproteins + - s3 hdl, high-density lipoproteins + - s4 tch, thyroid stimulating hormone + - s5 ltg, lamotrigine + - s6 glu, blood sugar level Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1). From 6578fb4e9e6aa04bdd3963a4a8e7afce85c353af Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 4 Mar 2020 13:52:13 +0100 Subject: [PATCH 072/103] TST add test of fit attributes (#16286) --- sklearn/cluster/_mean_shift.py | 2 +- sklearn/covariance/_elliptic_envelope.py | 15 ++++ sklearn/cross_decomposition/_cca.py | 3 + sklearn/cross_decomposition/_pls.py | 3 + sklearn/decomposition/_dict_learning.py | 1 + sklearn/decomposition/_incremental_pca.py | 3 + sklearn/discriminant_analysis.py | 6 +- sklearn/ensemble/_gb.py | 15 ++++ sklearn/ensemble/_iforest.py | 3 + sklearn/isotonic.py | 3 + sklearn/linear_model/_ridge.py | 9 +- sklearn/linear_model/_stochastic_gradient.py | 5 +- sklearn/naive_bayes.py | 9 +- sklearn/tests/test_docstring_parameters.py | 94 ++++++++++++++++++-- sklearn/tests/test_naive_bayes.py | 2 +- sklearn/utils/_mocking.py | 1 - sklearn/utils/estimator_checks.py | 16 ++++ 17 files changed, 168 insertions(+), 22 deletions(-) diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index dc90967ebe5dc..32dd1d3ad4fe8 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -299,7 +299,7 @@ class MeanShift(ClusterMixin, BaseEstimator): cluster_centers_ : array, [n_clusters, n_features] Coordinates of cluster centers. 
- labels_ : + labels_ : array of shape (n_samples,) Labels of each point. n_iter_ : int diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 9b7c00efd53a5..801611943f350 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -67,6 +67,21 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + raw_location_ : ndarray of shape (n_features,) + The raw robust estimated location before correction and re-weighting. + + raw_covariance_ : ndarray of shape (n_features, n_features) + The raw robust estimated covariance before correction and re-weighting. + + raw_support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute + the raw robust estimates of location and shape, before correction + and re-weighting. + + dist_ : ndarray of shape (n_samples,) + Mahalanobis distances of the training set (on which :meth:`fit` is + called) observations. + Examples -------- >>> import numpy as np diff --git a/sklearn/cross_decomposition/_cca.py b/sklearn/cross_decomposition/_cca.py index 80fa41bc44149..bd2e933339228 100644 --- a/sklearn/cross_decomposition/_cca.py +++ b/sklearn/cross_decomposition/_cca.py @@ -55,6 +55,9 @@ class CCA(_UnstableArchMixin, _PLS): y_rotations_ : array, [q, n_components] Y block to latents rotations. + coef_ : array of shape (p, q) + The coefficients of the linear model: ``Y = X coef_ + Err`` + n_iter_ : array-like Number of iterations of the NIPALS inner loop for each component. diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index af81ece6baf58..88951d18468d8 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -721,6 +721,9 @@ class PLSCanonical(_PLS): y_rotations_ : array, shape = [q, n_components] Y block to latents rotations. 
+ coef_ : array of shape (p, q) + The coefficients of the linear model: ``Y = X coef_ + Err`` + n_iter_ : array-like Number of iterations of the NIPALS inner loop for each component. Not useful if the algorithm provided is "svd". diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 49b78a0916e7a..9b7ad28f9f235 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1447,6 +1447,7 @@ def fit(self, X, y=None): # some online fitting (partial_fit) self.inner_stats_ = (A, B) self.iter_offset_ = self.n_iter + self.random_state_ = random_state return self def partial_fit(self, X, y=None, iter_offset=None): diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 2a0d19d373dbb..ac535b58e7f5e 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -104,6 +104,9 @@ class IncrementalPCA(_BasePCA): The number of samples processed by the estimator. Will be reset on new calls to fit, but increments across ``partial_fit`` calls. + batch_size_ : int + Inferred batch size from ``batch_size``. + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index a954081b380cb..2bd3948f2e013 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -190,7 +190,8 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Intercept term. covariance_ : array-like of shape (n_features, n_features) - Covariance matrix (shared by all classes). + Covariance matrix (shared by all classes). Only available + `store_covariance` is True. explained_variance_ratio_ : ndarray of shape (n_components,) Percentage of variance explained by each of the selected components. 
@@ -579,7 +580,8 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): Attributes ---------- covariance_ : list of array-like of shape (n_features, n_features) - Covariance matrices of each class. + Covariance matrices of each class. Only available + `store_covariance` is True. means_ : array-like of shape (n_classes, n_features) Class means. diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 7e354cb720bbe..c9f0b69f57968 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1020,6 +1020,15 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): classes_ : ndarray of shape (n_classes,) The classes labels. + n_features_ : int + The number of data features. + + n_classes_ : int + The number of classes. + + max_features_ : int + The inferred value of max_features. + Notes ----- The features are always randomly permuted at each split. Therefore, @@ -1513,6 +1522,12 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): estimators_ : ndarray of DecisionTreeRegressor of shape (n_estimators, 1) The collection of fitted sub-estimators. + n_features_ : int + The number of data features. + + max_features_ : int + The inferred value of max_features. + Notes ----- The features are always randomly permuted at each split. Therefore, diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 6aa4dac35a156..501f2425541e8 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -145,6 +145,9 @@ class IsolationForest(OutlierMixin, BaseBagging): is defined in such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + Notes ----- The implementation is based on an ensemble of ExtraTreeRegressor. 
The diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 111f0512216f7..896044ae9cc6e 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -171,6 +171,9 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): f_ : function The stepwise interpolating function that covers the input domain ``X``. + increasing_ : bool + Inferred value for ``increasing``. + Notes ----- Ties are broken using the secondary method from Leeuw, 1977. diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 1a9cb661318e9..c40f641df4b5e 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1705,10 +1705,11 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): ---------- cv_values_ : ndarray of shape (n_samples, n_alphas) or \ shape (n_samples, n_targets, n_alphas), optional - Cross-validation values for each alpha (if ``store_cv_values=True``\ - and ``cv=None``). After ``fit()`` has been called, this attribute \ - will contain the mean squared errors (by default) or the values \ - of the ``{loss,score}_func`` function (if provided in the constructor). + Cross-validation values for each alpha (only available if \ + ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been \ + called, this attribute will contain the mean squared errors \ + (by default) or the values of the ``{loss,score}_func`` function \ + (if provided in the constructor). coef_ : ndarray of shape (n_features) or (n_targets, n_features) Weight vector(s). diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 04bdba8135620..bca1928ecf481 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -1485,14 +1485,15 @@ class SGDRegressor(BaseSGDRegressor): The intercept term. average_coef_ : ndarray of shape (n_features,) - Averaged weights assigned to the features. 
+ Averaged weights assigned to the features. Only available + if ``average=True``. .. deprecated:: 0.23 Attribute ``average_coef_`` was deprecated in version 0.23 and will be removed in 0.25. average_intercept_ : ndarray of shape (1,) - The averaged intercept term. + The averaged intercept term. Only available if ``average=True``. .. deprecated:: 0.23 Attribute ``average_intercept_`` was deprecated diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index c23cb86644e1b..bcd9da1cb72fc 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -1147,18 +1147,19 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): return super().partial_fit(X, y, classes, sample_weight=sample_weight) + def _more_tags(self): + return {'requires_positive_X': True} + def _check_X(self, X): X = check_array(X, dtype='int', accept_sparse=False, force_all_finite=True) - if np.any(X < 0): - raise ValueError("X must not contain negative values.") + check_non_negative(X, "CategoricalNB (input X)") return X def _check_X_y(self, X, y): X, y = self._validate_data(X, y, dtype='int', accept_sparse=False, force_all_finite=True) - if np.any(X < 0): - raise ValueError("X must not contain negative values.") + check_non_negative(X, "CategoricalNB (input X)") return X, y def _init_counters(self, n_effective_classes, n_features): diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index dc5cac756fd67..55af69ca6c10e 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -9,14 +9,20 @@ from pkgutil import walk_packages from inspect import signature +import numpy as np + import sklearn from sklearn.utils import IS_PYPY from sklearn.utils._testing import check_docstring_parameters from sklearn.utils._testing import _get_func_name from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import all_estimators +from sklearn.utils.estimator_checks import 
_safe_tags +from sklearn.utils.estimator_checks import _enforce_estimator_tags_y +from sklearn.utils.estimator_checks import _enforce_estimator_tags_x from sklearn.utils.deprecation import _is_deprecated from sklearn.externals._pep562 import Pep562 +from sklearn.datasets import make_classification import pytest @@ -161,13 +167,87 @@ def test_tabs(): % modname) -@pytest.mark.parametrize('name, Classifier', - all_estimators(type_filter='classifier')) -def test_classifier_docstring_attributes(name, Classifier): - docscrape = pytest.importorskip('numpydoc.docscrape') +@pytest.mark.parametrize('name, Estimator', + all_estimators()) +def test_fit_docstring_attributes(name, Estimator): + pytest.importorskip('numpydoc') from numpydoc import docscrape - doc = docscrape.ClassDoc(Classifier) + doc = docscrape.ClassDoc(Estimator) attributes = doc['Attributes'] - assert attributes - assert 'classes_' in [att.name for att in attributes] + + IGNORED = {'ClassifierChain', 'ColumnTransformer', 'CountVectorizer', + 'DictVectorizer', 'FeatureUnion', 'GaussianRandomProjection', + 'GridSearchCV', 'MultiOutputClassifier', 'MultiOutputRegressor', + 'NoSampleWeightWrapper', 'OneVsOneClassifier', + 'OneVsRestClassifier', 'OutputCodeClassifier', 'Pipeline', + 'RFE', 'RFECV', 'RandomizedSearchCV', 'RegressorChain', + 'SelectFromModel', 'SparseCoder', 'SparseRandomProjection', + 'SpectralBiclustering', 'StackingClassifier', + 'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier', + 'VotingRegressor'} + if Estimator.__name__ in IGNORED or Estimator.__name__.startswith('_'): + pytest.skip("Estimator cannot be fit easily to test fit attributes") + + est = Estimator() + + if Estimator.__name__ == 'SelectKBest': + est.k = 2 + + if Estimator.__name__ == 'DummyClassifier': + est.strategy = "stratified" + + X, y = make_classification(n_samples=20, n_features=3, + n_redundant=0, n_classes=2, + random_state=2) + + y = _enforce_estimator_tags_y(est, y) + X = _enforce_estimator_tags_x(est, X) + + if 
'1dlabels' in _safe_tags(est, 'X_types'): + est.fit(y) + elif '2dlabels' in _safe_tags(est, 'X_types'): + est.fit(np.c_[y, y]) + else: + est.fit(X, y) + + skipped_attributes = {'n_features_in_'} + + for attr in attributes: + if attr.name in skipped_attributes: + continue + desc = ' '.join(attr.desc).lower() + # As certain attributes are present "only" if a certain parameter is + # provided, this checks if the word "only" is present in the attribute + # description, and if not the attribute is required to be present. + if 'only ' not in desc: + assert hasattr(est, attr.name) + + IGNORED = {'BayesianRidge', 'Birch', 'CCA', 'CategoricalNB', 'ElasticNet', + 'ElasticNetCV', 'GaussianProcessClassifier', + 'GradientBoostingRegressor', 'HistGradientBoostingClassifier', + 'HistGradientBoostingRegressor', 'IsolationForest', + 'KNeighborsClassifier', 'KNeighborsRegressor', + 'KNeighborsTransformer', 'KernelCenterer', 'KernelDensity', + 'LarsCV', 'Lasso', 'LassoLarsCV', 'LassoLarsIC', + 'LatentDirichletAllocation', 'LocalOutlierFactor', 'MDS', + 'MiniBatchKMeans', 'MLPClassifier', 'MLPRegressor', + 'MultiTaskElasticNet', 'MultiTaskElasticNetCV', + 'MultiTaskLasso', 'MultiTaskLassoCV', 'NearestNeighbors', + 'NuSVR', 'OAS', 'OneClassSVM', 'OrthogonalMatchingPursuit', + 'PLSCanonical', 'PLSRegression', 'PLSSVD', + 'PassiveAggressiveClassifier', 'Perceptron', 'RBFSampler', + 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', + 'RadiusNeighborsTransformer', 'RandomTreesEmbedding', 'SVR', + 'SkewedChi2Sampler'} + if Estimator.__name__ in IGNORED: + pytest.xfail( + reason="Classifier has too many undocumented attributes.") + + fit_attr = [k for k in est.__dict__.keys() if k.endswith('_') + and not k.startswith('_')] + fit_attr_names = [attr.name for attr in attributes] + undocumented_attrs = set(fit_attr).difference(fit_attr_names) + undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes) + assert not undocumented_attrs,\ + "Undocumented attributes: 
{}".format(undocumented_attrs) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index b4470a7ed49e5..1f0f9347a188c 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -663,7 +663,7 @@ def test_categoricalnb(): # Check error is raised for X with negative entries X = np.array([[0, -1]]) y = np.array([1]) - error_msg = "X must not contain negative values." + error_msg = "Negative values in data passed to CategoricalNB (input X)" assert_raise_message(ValueError, error_msg, clf.predict, X) assert_raise_message(ValueError, error_msg, clf.fit, X, y) diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index 07c7b9a70ca05..25b60f7955b99 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -65,7 +65,6 @@ class CheckingClassifier(ClassifierMixin, BaseEstimator): Attributes ---------- classes_ - """ def __init__(self, check_y=None, check_X=None, foo_param=0, expected_fit_params=None): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 21060b3462520..2cfb06c7994db 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2638,6 +2638,22 @@ def _enforce_estimator_tags_y(estimator, y): return y +def _enforce_estimator_tags_x(estimator, X): + # Estimators with a `_pairwise` tag only accept + # X of shape (`n_samples`, `n_samples`) + if hasattr(estimator, '_pairwise'): + X = X.dot(X.T) + # Estimators with `1darray` in `X_types` tag only accept + # X of shape (`n_samples`,) + if '1darray' in _safe_tags(estimator, 'X_types'): + X = X[:, 0] + # Estimators with a `requires_positive_X` tag only accept + # strictly positive data + if _safe_tags(estimator, 'requires_positive_X'): + X -= X.min() + return X + + @ignore_warnings(category=FutureWarning) def check_non_transformer_estimators_n_iter(name, estimator_orig): # Test that estimators that are not transformers with a parameter From 
b91501e65868a6466e6339c84a2134f2e3ad052d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 4 Mar 2020 14:08:24 +0100 Subject: [PATCH 073/103] ENH Minimal Generalized linear models implementation (L2 + lbfgs) (#14300) Co-authored-by: Christian Lorentzen Co-authored-by: Olivier Grisel Co-authored-by: Nicolas Hug --- doc/modules/classes.rst | 15 + .../poisson_gamma_tweedie_distributions.png | Bin 0 -> 63830 bytes doc/modules/linear_model.rst | 149 ++++- doc/whats_new/v0.23.rst | 7 + ...plot_poisson_regression_non_normal_loss.py | 455 +++++++++++++ ...lot_tweedie_regression_insurance_claims.py | 596 +++++++++++++++++ sklearn/_loss/__init__.py | 0 sklearn/_loss/glm_distribution.py | 355 ++++++++++ sklearn/_loss/tests/__init__.py | 0 sklearn/_loss/tests/test_glm_distribution.py | 112 ++++ sklearn/linear_model/__init__.py | 8 +- sklearn/linear_model/_glm/__init__.py | 15 + sklearn/linear_model/_glm/glm.py | 615 ++++++++++++++++++ sklearn/linear_model/_glm/link.py | 110 ++++ sklearn/linear_model/_glm/tests/__init__.py | 1 + sklearn/linear_model/_glm/tests/test_glm.py | 431 ++++++++++++ sklearn/linear_model/_glm/tests/test_link.py | 45 ++ sklearn/linear_model/setup.py | 2 + sklearn/metrics/_regression.py | 56 +- sklearn/metrics/tests/test_regression.py | 8 +- sklearn/setup.py | 2 + 21 files changed, 2925 insertions(+), 57 deletions(-) create mode 100644 doc/modules/glm_data/poisson_gamma_tweedie_distributions.png create mode 100644 examples/linear_model/plot_poisson_regression_non_normal_loss.py create mode 100644 examples/linear_model/plot_tweedie_regression_insurance_claims.py create mode 100644 sklearn/_loss/__init__.py create mode 100644 sklearn/_loss/glm_distribution.py create mode 100644 sklearn/_loss/tests/__init__.py create mode 100644 sklearn/_loss/tests/test_glm_distribution.py create mode 100644 sklearn/linear_model/_glm/__init__.py create mode 100644 sklearn/linear_model/_glm/glm.py create mode 100644 sklearn/linear_model/_glm/link.py create mode 100644 
sklearn/linear_model/_glm/tests/__init__.py create mode 100644 sklearn/linear_model/_glm/tests/test_glm.py create mode 100644 sklearn/linear_model/_glm/tests/test_link.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index f5b7fe427873b..dd97a1dc6e594 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -837,6 +837,21 @@ Any estimator using the Huber loss would also be robust to outliers, e.g. linear_model.RANSACRegressor linear_model.TheilSenRegressor +Generalized linear models (GLM) for regression +---------------------------------------------- + +These models allow for response variables to have error distributions other +than a normal distribution: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + linear_model.PoissonRegressor + linear_model.TweedieRegressor + linear_model.GammaRegressor + + Miscellaneous ------------- diff --git a/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png b/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png new file mode 100644 index 0000000000000000000000000000000000000000..3b95b724a662389f0547e06049e65c729f1968e8 GIT binary patch literal 63830 zcmaHS2{@E*+rC}0Wyw;O?8$EIVoKKRMM$eT}h> zkuVrD|7Ys=e((1k|L^_gsDoKN_dVBrE$4Y&*Ar`Eq(guH(s>dR5_&yd%{wF{6jdZ7 zq_*d%fLGpHti1z%P(U^G%+3LikaJEkz;hZeT`MRF38Mq?KWQ_j_a!}_x=3Asuyc}- zTp`iZylEBy+nx&!fjMU%?G1T8H5&PBI3oN;;qlG;SKrjXxnpW?+Ic?i*;UdJ9h89HvPUXw{_8>Y$)uV50@FW_jaX(&{(l~_#iEfy z|9TSjj@dGm4WjS^0m}?knXCRS0#XGlRqW^YqvdM{MW+c6P>bT`aZywN(%=U^SE07( zZnYFsUkJqSsTaekrrA_EWE}P^w8I??x#tQQxLdUw(0leibBytKdsOH}s#8MHwZjQR zx}%IK$Bk*i4cS_$%qc3+fT6!Qf9lk-O*94SNJwysU8p_uSK@?=>s=r=BRLAQ6jDO6Zyg$P2svupXN?rk!MHX`2^4;=T`v4*>Yw8ln)mXvw31rBB-zr!R@*|flKVs;&HHP2nD?*rZ z;)-X6l_Mr#5P_Dh9U0-a6PB$FgdBeK>17(SPcl5oMdNg9JSY@MU%-{&zVL2{-#ci9 zOze&uXj9@>TBA$RxpuV`)rBWY%j-7q#{zH^ZlPeQ}qwpcPYyVJn?*&VVuH2HEb@ zGihmF%XmTs=mX=-{3W3 zNbr`zpgV$u?v{>2wf87X&;ae>i1xcJUDSj7hrEJFbmzJBSeIZahZi>7mD183p{g$6C4&j*BBm?hlVNEYP=s)p3u9r 
zRYTmg@TSJq-c94qA-yFskZLC)%pG$2Ot@_*Yl=GGB5p!t3UO_4n;lX=hq#Y$OV2#h z@v)Exsg59WF))q-Qm@Fk<>)TmV`EH7u zS=kG8*%Old&MGQ%N1>LNjM6(Te&Ptn0;%suFl0^ffr7gcc+r5J5qM>k7J+MKt{10d z(VPJweAbv^2#X_!Qy6>8f$@n}%3srP)ckoR)38z^#eUV$ntGf}=aH_Iyf zxT45)aC$w z60WHZB6w(!2ZoTtyi6Nb7jS+QbjJ_mDqpKOxGm<5?8_hAwzz3s=$Jw zJs}g7g^Lf4#6YT&okz=Royy9yYGGlD1HPQ(1Bmm8ZOHx#r#!Xq2*?!ml{#{WjM5Y} zu(nu-QL?a;s|Cy=jAP$nkw9`-nx4YkAY#utM*Ki6d}{}c6kF|}!qx$W;8+PXhb2T| z29a}qv!bhW*QG}s>RyF*1GNYQ7$;>AY)#VOxho;|6qsI*q%B#{sjW$oT*lNc!tDXs z%+6bAMLk$@``M7Go>Y7}#U+d)iI$)YfsT%Rtyj1bzOd+poLhp$e-4tX9T^N~4;NE1 zo_NRW=;`yjc`drvt5f;V0^;`_ahNgHFqmbJZ~kdgm00<;jF)G3BknzNmI)>j?g$Zh zcCzfZKS8I1u-j+->gL#nIBZc> z>d0kk#mixCzE>%2pX7qh=APT^;=UnZ^2ECb9yg5=2c3T6Zp&#=SUbRA9NL(YVe`ROZJj@BlI5DgXRa4W}G17#bJ34H;w?JGq{Cz6vw@M@ur(UiR#`Qbbh95 z@5Myhg@ya>l$g5rSk}ckcrs?tefQi$tNtGz>ObZ}3n^wAFN4#1PHg>J=m=pFG~;ay z;dx+iqiRaA=EL}N^F>9G{uDB9Wa#)0y;LZkdRSq|i zoRGy>`@{GKmd z1E)1O_IumrCYO}|{)r(LeMNuo-~>!6k~vl8CAPS@%%4J=;D?%xq)5~6kqdcvZ3yAF zL{MyBfo=VoS3#)^y3ym$*i*iSoSOu)FJG*$B!v-480qM8rMx;L>;N(R?2#5u`4bEHaE>|T?BsC>_epuN zZzZ{Hx81;J@^ligLhUOkx*~^R7%`vRkrpHo0{|MtOz?fFDlR*eB<(nFVu!X0rCmcP zwVhNVYQ4_Pt?eXy&xW!ixl$*1^R3MJ0Imh*gM~=3h};aTPf&CY&v?8@$|OHxn=B-o zz=roUeNMQ1mYkWr1e@Wi?eD^NjL^e5;i!l?uC|x2&7jDQ4?2~MIK6Sw-7dSB1nLTk z%{@EaWX7~phpJg+)2*~cW9F;~6B7j)OY}(ReoE*5R&dPHuze;d*m6yvQ+6Rnlpx=} ztRz#4AeE4DE}~wg4GIL~MJ>@Qrq12icu8s)uX&hMmGb8C4wm)}jHjMZJ-MUVR<9i3 z?x9#YcNw*N$oz`k96$CR*Cn&tn@UkXb=S?9+_*O|7jI!^#iQOA$icz0PcUIjh`WE- zb@MoS_>A-Wt>h0U@Nhogxz@wM#p{+F*9q4!Dq#jGLN~1>@dJy)#Roa@P}@G5e%qvL zEy_v@h6z)D78`*gL^FC^IllHjtmw!GJ4F-yw+msop&9>#W{MRNw7xWec5Uq;g(JyhclEytK8% zbaRUO+3I{IT%jetT-x$*OJ?`3fHYsJn}*4uli5^VXLAR&cqy*8bD_03{b*|feyFs% zv=T(i!Df!{dym_fAqYrD_HOmz=YnF-)NwAGvbaeo;o>$beNLR9AClhnIalQOcgHGl z#PH(&Z(Z1g_Hw21m*l+SiFrR{r~h3)c$2BK{SqVM$+1h7nl`+@FKn^Edno2}<*}Eg1oHYN=^n<-8K>{!uvxPMK~wl$#|9SY z>}AK594T-k;ZR)EjPxbvCoY z5|&UhGdOOOT=dW_1@Pz#lhgwp9QT+zR%%2iVgLqJeKsLQW$S}fW4G-*R$U;(fnyDJ zAT5Sisd|*P!9@MlrIrHT8hh9)CCQ%L2sGecr1m%5j?%y?CsQds_#d`j=xNMkqr 
zEyOQIWz?aLTDSssYG&b<=E<5Bj=fj8a~~R2;}W&dDe~+krE|t^+L1>HxAYf8|6mzc zxXyu|{X6cRcYEGA?k$0()o0GRUqot$Xwq6Z7DCZQOK=$PR%x#)GIC(gPB#2U zZM07Nm5)SGvxCSc9T@aT8Kr)1mNDeoqdE>(u(a%>$gBKnV)S7>t~UE}_!B`KRs&0k zdTeHSw&RgaZ2RO=c#>jJEoyJ-_bCio*ZDBKQ!&woE<|~KA9ocK zp{i2lAzwA3?B5uDrZaO%Wc&MS7ZlmC?zKPeJ(bfoKY2(qia7` zWvC}k+wN@n9E#TcUgz*Ibw=-=JO+XwoD;+zc_DX>+BAaCf@5?SS|d6Fy(%)*qKpkf zIdcu(@y?1Xv4Yv^Tb(Kg-+J+IV-ar9HCaa?1~dZBN6U<^|=WCsJEl`fDNc%dq(JhWT@0 zMg|X6`_6Y5@8NmEd&6_D^Mq5Yiz^_#?p#aRZ#ux-cMBbNfOs9W6Kz2YE!6IKU{fT!-bMFoKOL>=5gmCko1?`yZ*jY4?vgMSgehGr#^%w88g49NE6)B^7mo; z61EC!nM3e&*{vfe+v#WRA-lfFa+*{vUJzJ%N2pWwh{TTOq{@G0p^17f@#x1$I?8bw zHp4v|n;q}T@-F`9LLUSc6RVR$J|Wb8*L3YM>~@v$L>C>1wWU%zd~;h zKCiU+u&0+fr2&fMrt@geVMEoi!x^W!UB;sK9JfLlb2ZhURVwCO!9R!WsVDcrhMKMG zd)+B~`_6~7HJ%GcKkA-vw?hk$hb_T^ClNW_{^4QZFr~=K%qg90Xt=HysB}P3=nXs- zU;hGYcilKCsLlGOe>*P;Si1H#=Hj45%)9<+<)iEBAObxoiyC(sU(rAGY{$+X(&c#H zLFh9<(2dlHmI|4u-YNY1qw1q;ryQ7!%omvQ8vGALJNK+IdRMhZgvs{Ud60jH@^-6}5;E9q5HCT}v+~ptL0viepj-POPNZVmNH@3S2Pg1Hp z=-z?@(gK&yVDi|i=S80mRZ_5|^`xSnTg2P}S9vnk?Z)^YIZBm=d0BcUnt~MsGIbY- zi!#b~+kMI2GWCcwJ{!6acP-f;Y22WXiOLOEI^pt!#QHEYhUcoMHM_CngMRjQezp3v zSDw&CQ-&`a=;?mE@g}ZW9H;JUkv`V@m{T|^p$9mOLnW`Y%xrE+Y4F8U)SgH|<5|^U znGzDqd3kT}1?@9uZOU3YZCb@%M&nzjO6s7FE089s6=RRsd9C%q6}S>EVvS<&;!1L@ z4jahb^VKBHP(7=95~dGySTJXio6M=Wtr9|}dU7_YJ2CYQ&TG7#o#gZ|mI z_mu?pJ%2bp6|SJ^WUm(QS^kSMDM^)GlEBp-uUT-N<7ho_krtvL?B^)s%(&94kifmm z01}J8bc*uE!Ivg9+j8_rm4wx>^mtsm5>j2`=21%HGc-ypBAMFJ@%coS5t-xdZslCw zFO0H0YVtwk>U(CRGl$HqB@53kU$KKXWCW{RoqrwY)fkkB<9M zKu9_k+`WOHvJl%lFlv>_CsF4PsXUhzP2f$e7^3o_6W=#V3Y5~OPz=pWr(KdF;SOtz zn4ExkeRi#g`HgPDc8=`IfcFE-Cbym^dGvRTE<_$gHFRG}2p-!Cyr!!@Mq;6p{MI>a zZ6Dpk%OLfdMYTN*>Y*5R13TGVHsjrx`X+w&kTZ!79MDc}d66~RGfiv4S@~lDGXlEwJvxzqKRa=(`>t!GbO3eh^Odw=I~r`?rpv}5ZrS7>B2UBY-z3=b|j^NfP<;w%@l zB5#CtP{TOdosotY1%bn!{MKJrQLw}gU;$Zx<@+Y=)Yr3jEU^pgWF1&F$GFscuUUq} zi~}{x13G0~JXsvT-wWa9kV;86a3g3;9MOc_L10D(kJ5_(CPw+y7|lhRBU5{dS@Aey zX-Bszw%)Moh0B6D2(617bXR&2?VQ&a)?u{9x_2&Y@9W{VV-|bMjx#gWShrg!z@2ce 
z#U!Pbyligqecrwv(bRGn)RU@B3eRon2_d{BRCr=t69cg_mm8=DcxA!uD`0BHr~ZqT zw89?+@U}R`_JKu5h@T=J=S5$Y)o}DO21OM@TZejfQco8ix_3>@zl4XuO|f=iabZ9J zyT9ueB9&2P4u1>ryWHNcz6=W%-|O0XD1ujc$RD=-No)@_I}}X~_eZup2{ZU<{cNi+ z6w9Ny_lv#n>!CT0KKU)=5Q;lWkXr6#w;t|YNilmWnsaE9{NZ?U-K@|>xN4gQ`?%_8 z<@8nuNr*HgGgi~kHLfGLnU3%Rb1qCF0Tr37Qx0MQ&wOXY&8?~6Ui6n`xTljeEQyue z!^Q3eg_%MK}9NndSahXrbobvSNr!+*e>kJ3J;F`5p@XvlP2pu z^39$esE~T+Q^Cm|S9o#+3rj%qe6+Df-QsJP+}H+QTv;hV$D}&LtpL6pxpxge;O#Lg z+l0-uJ6e<~dl&WVb(Ev^1(6(#>yO^EEBKdRoI+fSwQ*W-#em?mdy%{&WlZ40(QK@W zoL|;l_dO2r(}Rp1WY5B)67%&OGDzJ?Un?|qndD(7AC zVR?r!oZQLX%ODFA^=XJPEh__l`pYzAZX%&FSyEt4MbJ|6={7*t7P=SgRajv&t`S=g z$2ngDWi`ACzm}7vx>mRExS)u0iwdVt@&#dDC~f6NRJy#|JJ|Kd^wmwe`?1ex=2a-> zr-k$4wZjC|y$^EXqHSlP5Aiu|c(vag8PSyC=ZKAol+U2r*L&(wk6{@Ch6< zYIb*obD{ct^9>Ku_9a+LOT(&`rsmfXjz9ei*+)t!+g`?J;kTL{Ij>AC?w%~XM&AH> z9xs{e;LOBJ-jsNK-##`5fY(4aj``<7_e4|=(BlAEKO=KT4&(w!7p_7cL+YoALU9z( z_@Mj{7xV4P$7W5zU_fH=LT28#>#M$F!_P14^Rbk2TpE{f-cHxa0hIMTm+JQrR#e`z z*Vyp7Kb)VHDW%iM7g&!=Yz%0s+%oG2QVodu!5S?1bF?#_7jqLe``FZ(kql7Dcy(`P z09TADkp%YX2rQ=Q^oX;}(LS3VCsVlH_x^#VrXW+PzNY3+OX5d31)q*^-%z|3*_=_M z@_rN%2ZI_`i)C1mnq0X1fNJ}4#sw|Smt{+Eqx9pE@1?{);F!9T9I?sFloFZhVrAZxhC5mj|jhHMKz=iZsX-)&?itHl>f>alP)B&l9@+VXb!@OfNlr<R)otRrmaWmHP9-*wtYiCHJx^k&=i&RK|Zew{hPXA`j$=bB>a)9rV`neYZBv-$wog6)W zvSn}CY0u56tyw@=HxMEA0;BG`x{75fxaZZ@g62MGDxr$Y%A98E-BV-(P;%>ph?QN5ym+3IvWf5hwCWg{zGltKzfDHjN6M(lI#+gOWo3g5 zX@~Eh={C2we|}35c=P5B-MfsFTP`jls{UxFi3%Gtb93hhPd%6a>czj2OhlImiS zc7OaY=o>h5N?9HS5W(L_iq$3&4LsJpg*$wbN!cB@HJQif=Zzd41y*_z^BWtL=9V;{ zx5%-vu|dqpXzdXhGTXCFuMZFX2IJ(<$*m?soJJr@jph1*rnU^!kMc`=F`o@snK*TC z6Vn7;#IxB8$(5hTH2{mR^P3bTe_7ElQMKNDzOa?VkljKR)r0$iPtTyZ7a;s)%@0U2 zDci5C8sdSJLlv9S9r%VKYSUw8;mz73&6n!F&idSK6H)qi1tZ6Ng}-=_eJU}h5ohXS z_`4JdD8IDqTtxZlr4g<~U}^Kjz&#dAtw8*p3Oz^rSlu)ez=)bo5e6KqES67*J=)N* zH+JZ7 z%@#^N=N~pkH*ZM_8^LrP9QdGXh^20JyKV1dA^kLahz*ZOZphQPytnx8sn&4=q9j~g z(@dfhK(0-72QCJrlq*YYmO$5y|7__)G|$5yd0OJOtr=sVEI^?rK5NMA0rXr;@Ppma zyT;DIX{Pi-ht$&ZVm{hsbjGZl?V7IGb7#M(0&$zh#uUK*I0w045M3G4C>^ku;8;#D 
zTKU(nUynRIlHRz_w{n6U2ht?;Ja_A7JXYShRyg2s#|GIdZ&wG>?>=~dP&=9(SqW-t zYBIZbPn&0TbyaWIdU)V%(ou(IVwCwCUb^K9HelmIK-P=^@_*qa#k^M+lEx2>{CGV+ zJk@k)zQxS>3kH3hX~Vw9f##jtSAxuLEz4G9Rj9Yt8Zm!NIz0!oP50l*iF-NoLJB-veA2GvMpGVo15Fr z%*-dSJTLFrgk5;sj4uk2+0}M5Kk!EME-?fZ=X_XDIl+=!Q6%n}W6d_lSxy+0qY zp^t!{II#n61$grJb&cM6Hpi`Y?KPOR%G^?n?)kCNQEAWlR!L{q@iK>=1kMzcv$OM? zw{Ph!>DX<2On)AY7=yEwp=%~KHiKP^V1x_!bc%GQ^0yTG<49k*$cvm`ZHTVV{MYs0 z|I!Fpx#Biv@SOiF@2XVvM7zAA`>Bzcjt(`jwP0)7GQg*x$E)c&IU*x3q(wxco12>@ zE62vhOs%Xky&eeYBsu;1+2=M{#kb_^bYb1$!cn0D{jwzg)JUvbJ$_$vy%fkMMaRuY@_Z}|68%CYDb-PHp^H)KE&Xl^LUw6nPv4eeW^Cq2qeE6Ot;ds6aJPVRJkouUD?^L9lo$n54>1#XAY0y=T(_V6 zBoX`;&fF&~FE39r8;Hslf6(}f{<1=q*OCUweQRrh0ilr>UO>b=2t62s1+Hbne8;V3 z!%lbP;Ax+bnT16y6lKIFWouw$#9(3LYaY&B zTJQyPg=u=8BJtFIa{Zh5bHJv$^?#EE$y@dlHp`h&GJU~Mqrj2*qaVx%ZAlawX7!Ag z+-=$@%xVO}zi}l=wUK~7eLcVS*21)11t1qcS`usz@ zA(&3b>0CN~0bvT7vK?s;;TKC*=Vk!grfgrh?N0*OJG1O6yY@vCX_fihIjx$W={na; z1^4^SU9SveAGQ(#;X|4nJj73+O2adpi45vMd_Qm~Qhqv73CSIWdnTPV47|=%ls7lu-mtT4|be2Ub zNBlJRAAreeZ~iAU_dXfbB<#c4u9VV#00KA_kkp9jl6WohDlH(J$LQK6!q&sl3uH*s z43yOByJ?ltWn;~nXMfY6_$--g77?ftKSsjwS^mbYP$xDG)cd+yD}XuI?*1(W=5LRI z(>+rB@uQ^WCqMr%brYqD3X%|JuU|mRcLVx~xb*QqOJD4w{We7NV}4^Y@VzJy!!h4X zN7aQp=9_@BKAQhr&ncQ+?~{Df!=Xos==MRqKkCA|dEK$*sW z;*z!ak{Vq^IWbU}Qqy+RnKmMqH2PV(4+}wR0dZm1U}(*PZf3U+jIXQmPpD!hS!VC!*TV{Ns){u znh087KbnR9TwVP!JIf_H(;5hZ^rT5xr_Rr7vDApw((j{& zTsB8b#O>N)%gYbZO{koLBdI9OQG@3~DOTr*5r6*?twCzo3{9B!09ad&qfD>Z-M)k&FB zV{-K?CHn(7kPS~i8*)3ydjo{(39Wa_FXI6(!?qn^Llp@E>AS9lvh?3<>yDm z#l=akUc7j*6@Ri7I$cOdmM?sgd#VU38};XNGh^3a>LdG87z%X99xeQh1g953IbIvg zn)v88_)Z=`aRR?u%gPdJYHG})g>PAzv42^93WQ+F_J@BglFTrqSxk|po4b1PG|gmV z+W(n~ii%}}$K1?)`*Pd1cLP!#*MEiltCqXB5o@`*xe&XUiLI>>kX(IzeQo-;Ha7vJ zYY-GHtE=hD$93<*XZY}cJKL0NiIGB2w-KqEJ?WBL6O}~({Lnw@cq?X6a%GT7~aKh5ogk7{wSED}rzfkD+IYn}R-)5NWYKe_>Bds8;!alcW-bUNpc&-6;>#LM3M z=d$li3NwLv(fQx%g}EOYi4eUtDR8X(Q+yw3fa#rQneGa;5KwR@0=emY49q9~nH6(~ z9Nq$n8JH6}U!#+T+8rC>wPbDF-6a6WxouB>C(|Td`b3lDJfr8|0ulYjmrt%%EOI8o 
zHIh9-iq!bs6f0^Ltu!!1?m?+ej=PvaH%qHzZyNW74Uj4L?{21){&*K_A~<4x@803` znDMZ{-OW@l+A>^J7U?6mo%gjPOy{kpHJF(=AC#7?cg1 z01V9ltrI`wnIdNQMzX&ykh&e6HP!Bz7XmS&d&ybTMkCWR@}f#>cE~izBe%PXK;XF% zr4TE1X2O z4dCNB>UK@}!VbQb(n6Iv^b)<$s_INrEoDz@kE{LT`=U;})mx3V`;D)iOBzpIAuAEw z@D>zx7#r}nqfoi+DiLuRzs|p}&+-M@oV&17D-7-;=rR@VQ)Z~hBlP7RHe5u4Ls#Su zG3-Q%QpqA9mE;6%ib}R(Gysy7Y8zfW>S9#16em=MACR554a zbMD?76?o4oi!f3De5GOI^{N>LE()4n}JL zaVsn?j@d`5?z-IF#sAo*?`DJU6;S z7pSP*Bj7L96-4Q&R==J{(P^QYam>+BOO1J3M95d~V4a!x4R)xlUv4i5GEE&GpC9o) zs!U2^KqQ!4)x7$q|2R5`4XwbDB=P#W1|LEkz2I||;K{h!EQoGi-6eT{ZJlXqSot$S zea~s@n6>6;5)imPRQA3rWwTt|)xZrSY{_iD8e6H-VJRar=cs{><=h2v0!P(1#JJbz zCUiq2FtGRT;Vn(UQzrwpJM#BIOmQ&ipFK2cHoAcw}^RQGvx^pvFc?Rl;~eA ziD-3rK7A>}zD0RaZkWvtqgfa=|*Qhy&xHvMFlY{xz{QrQp`HqJr#x9@Y`zI{6Ye~fCeLH%d( zp~x}yW+7roQ|ZU192G#GTi-AM43wYW);!?K(I1rR^vE0eV|4yaS}6M}n*)SK%I@yn zF3}>GhJ%Ac0TI?DT#}F5+YW|bbCrmN*v}}18w>Ng53p>NB*G! zK)4kX(+|kHTWIj1u<&IRHRG?W1>DO|mF@33aH~>t!qtLZ^pyny{v66X^mLxnDd
    nN|Ff`=yZ<#@^1{Zfx%o1R}ve5!j zmZa5(vY;zCD&xy<^E7(Vcy^CjZK*>J*{o+tDOJr}ZhH}@gndkGve z&xkS+ATp-XO0`PY7=(tZT~_o+xZd%rUu>$@g;8-nK%lO!j))OO0EFmH-ZJq0two?L ziO5U8r;SaCKZGbBC+PxX34i&Zu(@;l*jQTVA4YgGL&SAz!3v5CA<*<+lQ-vH@9pOj zS|^+VY)F(An%_uRR^70n0$Cz6`bc1pRF#3bp%^FWI;o3HOnR?3YaB@2=88&&v}6bW zjS$^T{K&YUb2uc*p(_pj<@ou@wf`_#IhrBOM&C++OFE@aPu~Z82pFOn{U%m`>>>MJ z?GpMf3dp5OM3IO;Rd_5viekLIM^1;I|E#2@h8gt$&?>}hK%QhP_g$+`fcsQ{c2k7u zAMh(j(*U1dnC$tmRxhU`03_zIoM+caejR!0LIr`hFoMa#HI9xJd4e`luDCV{N}f)Dr&XdvbmOq(EaqMTJ|H@zJ6Ua9s1>)+xqw4mRku z0fqdpH)uoH9>1uwB5#V1(dA7LN4iACZ)!|VrOZbFCW-iK5{2qn#-HuZl|7xzDQF3+ z8VbwLsOTkejxY73GFY}599 zo2nL*{md zdiEeDYn=6S>oc>m^a17dr>U*&{K~KYEF7rs*ijYD!Y*{tJ`LvX1 z;TdP`aLumhCkuz}ZW^f$zn%LKBy8$q8Bi-WMv8QD?1Fdd9(Z?ORP%6|RRSv=!IYtm zNAtm%vd-sQ{C1WLvpkk(p+n9) zpjT4{=J*UJZbkN=c)2Ij`U+_ONbF?0EiABdj@oFxtp86U#PfRpA6o2MVc@ghBSftU zkfyKWFkK;swYfXF@14J0GJVsVm3@=!6W(zBQ53B8HduOor?czu@^EdZ4oEUU13*_L^qPOoiMVs z1XT@U;^4Flum3OAhT7iS_5Q&X5jKribNNWT6BSeS&U##z{f$^+7$ zuuHZtTf;(_Dm^_tA{X4z!C+2oz_qt5R^arwdQ~R#y^)%NwSO{$>^Xx=J1mx2ax5Qo zwTl5v0>onL!ML?SmO^65-J+)>hKk;Ez7s&r^7kT~9f`CJ?#whm$ILCN{da8vMZ8NH z1?$=$0uN=#IP(8bRYNWK&}$`8@s73i0Dx}*O+#c}p{j*0R3JWt%+=LH-X+2lP!>{e z{yBI~3ZgDwTV~Ue1{6E!{-C12XfpSx8YiPV*1mE;+^Xg|(1!C7wXU*{Y^H-mGTkI| z6<`U81eCWFy8OUqVDNz|QV)3X6t4}aM8tH-^{LKvLe_oyvrjail^~#-)+N~se+l)x zX$`$30Hi9rKdFk2%rTnX1eSZND9ZhGEYZa2>ZkJZj};Y0G~#+COj^ezZcliqwDRG; z8fTXuC*20x!2j4SgL5n&_`Dzs(DhmdMsP+;HW;*Oz7;#PQ;y?%6gZ%9Qf5RE;mY4- zm|yqDlTH>8VYgt3TZU((@B5NOWHqqkWFY;fD*v^DpzzCo(z|9VsB}`K3KrGFBy#+v z*T*OWThx#nr;4<0Az{@0h#eco~beSK0BYOV7wpNaDU47$O<=epj;(+nuq zV-0J!tX_RB4HZgcijDc5J8OnUAlgYnE*4XaliB$o+a|ky2}JIa@KW|iNWjy9yRaC2 z%32r{_SbGYPIN*_`}e7W{DBf5@xd(lw^pE@x)5a(*W<@7Gws?&r5rivSDM3;Qc{c{ z$DtfGz(J@$`u@5^MR0g}`Zj>EKbWk|viKpZU50oN39YRN90RP+S5GOjuJQxrwW7}U zh7hu?uSW&SvcP~ydJRQdrVY(3)dSjAk`S7JVML)4%CFZ-oj2h8Bz*|oB&iuWHI>SE zG`{}r<*=jk*^<$`gfmW$<8`?Tn2r4RmWxa&Jaiv}3Mvr^v8&y{G-Hh60X=YVBtWbs z@Pcyyr>_m%U-OA(S%T%4l*H!c<=uR3wj~AW!S1c}_4VobUp=LITP3P% 
zdyL9vz_0pmDS7_lNV;q)_q~OkDFsa*8}!rCkV`GWSC!$hutv&NfPTPyhV=kZ=b+sy z4*0Q$01bJ~2cn!G=znZ&ZA~z+rIc-!RGudLbkQhhjdtvXiKeV3pdoZ-$6LWAmSj;6 ziZaiblmInLovw@mxS2PZBshXPf`Ecuu8O>=Fs7)PYT>|*tZr&I1)M2Gj`Q}xw`vuY zS1mxe#iqS}4JuB(23Y2EL&IH5%M5RtGK07DzIsztWLF+VnsW+P9hODcL%xX_I6Cq3 z_ez>eRP{%4y_YO(Wl(C$@--rvr^Q|NlEE-6_TJ#}y1V@aa*u>n^CCTUxW9O2i|>jK z>jl|n!DTr|J^0XbX?CXYq|5T)5bFuzFxDx3l6i5 z{yk5_<6k3wu{I%NN+~3K&tDmJPy1~+JC+(X3sc%^C1oXh{l#Aq;l_k9N6mVBr`eH- zCrN}aMp9pulr*|F|JJwXmgfsEjzuiCzzA6~>L$)F`&(W?LDqBrdIS@)>DES)h-pc& z&a;X1@5a7*cJ9h#4Q;9Vk8@ULTo{_OPWe+;iSpKzX?klI9NV`5RjT;5cfiTrx;1Fc z3H>QQZWfwWo=u=fi{)xVP!C)Y zzZw>)o#-wzxB>dsg7M?WkMDHa(uGvuFQlIZ2u05%_-89e#TzGEgPmdj7QVieJz=O^ zK~}81+*Uhm3qC4TecpasOO5QzGX2H^euzP_0XicVzHE}^jSQA7`21-7EF7|h((f%_ zK~mjn#=vVCc9_q>&zq^Bx|=HcLrB@JCP_V?xww>eem56fQH)L{Tj+y$-V%Fb{1O)ONST7VzCc+8$f&PNWvWQLkP`V>YfZ?e{gpT6 z6!Xe(hlBN+%oWSJ1CkD#w(Ox?e~ki~s+gF9d6P1o6aHZ`*9Ib$IFkS}VU5EvG1=SQ z`c!G$V-PuQ0o?8V)hAknf7qpDTSH0ECgeUPKwIXERlhWI-+Gb<@J!CUBtJfY%xh|< zvY@pak94IqEmk-1sEWGBaX`%qzZy9fzw{!ol`wjnB&;PU7${jxDQ;fNmbvih=E{`% zB&2N`8xxcA6Cb6>u?q3vM123K9U7~$cg z+wbda*qMO=qLdw9J=r8Ni2`8gqMo6*J~VNIK}XZc#6&GML#B(Cq!`Tg8GgzCC7}sg zNA;9Fg|B=Ds5U6T3uR zfdAIZ&D3_97vfK@_z!9VLlXK2y1#m;Xv*fIqpU{7j9!>X%AsFG%?xv7#(b-e&4SC; zYGL~w)G<1XcW`vvuOn+CvoPdvLPMsS14QJb;Iv#H4QKk`@A9g!mfJ)+%lx7F=j5OM zwO^^u3B=$rA4v-%wavNx^A6i)gj1yaSgOPUmqZlWWqx^R%fGhrgS@O#UQCRSzHgsQ4AAfc&}wQMP|rJmuvtpz(Dn{K zpH2@YbKTS#X8mW&Pmwlnzx4z6i`fTuXv^suR&XFcujpXxugP032u4NqUu|@4Gx^CS^ah#K?oH%*5Ny(or@6(nkvw z^C@dU=jJ8Vm{6#{T;y29$om%l)Q@J#f}U>QUw93rq~#yz{5E)AdRNmaX7e%Imp%9= z$YG}$C1^4zou{pENV-EB=L z-9B_s0b|b_phxe{ophGyesf9m)m;(^YKH~u#|cK71-n7F?b|M6`7o}a)6C4b1y>7i zesZ%^-v!3MzuTkv`Dyy+n+x(2XLk*kLo3ff%E=PXCBhA#902|ABwj0uN5TLwd@^O7 z{M$6lBgV8l@CMMwZaOvzd`GYWIQq8>WA0=FfeYkF5+h1mnOZGKz4S>+jV^U8a5mJ7 zDe|h1cGnGnt~`H~^}Pw9Qw4_|!#AHtnS+i=vE5o2xX9x7isy$=M;08x*W|Weo=Qa-4jnefvh*^L+y7z9 zYy;ClKRFU;Ap8kFe9!Y~%4}Re)UFU}{TKGk$57Hoc zGuV9G&%d+yKn(BJJbLGyQ3jojDf~Us+y>}VSzaB7 
zDARe-K69UbB2avOVCRH75NwZ4&e!>EKL5o7W)l%s^kI^-GvSXWowSd+7b6x$pq)sJ zw4CXgUyAz{aw=Ml$4`R2F(O=X*X{bD?v0)3t<59Wgl5`%3>yz+$A!d!g z`Oie z{8Or&?q=mtL1Xt@V`zgk^b~;}F?1MV9 z??q_v)#gV_HkL4a0l8c5lbjp{pVu2)q1o|!!yZ*$@ppmfB(qcS#ZhsJgTEBjX!1M> zd4*`%6=<*Mqu+#=YXr)DSX_yf^y-ATwFn9C1Wpo@U@(`{M;`LgSy{ei1#3l`CL5eh)w3jopZ5m=8+ zwrHH~NH0163c3gK1j@rtMjyqFXiRjndc`ccwpX`3 z7f%2C61I&KbaZIZz4H5ij?RPF!W4l`z)9ZeCX^HcxDYJz%Bc-iDKeDU!u3>2(s1Bn zDADJhlzhWt+o(U*s{+5_+Kw7|TK&!u!35@i*N{tqOP2T`e`^G3<%R^ShdORg?Q&B{ z=J!iSS<>DZRyDM(M=D(65@3*RK)fO_}5=S09Nue57oI8>jr6a=y>w zy_mAGYZr*#`43#ZBx%$yM9}CYcqt>TLYlAviDEHv^QUtt>J*Z?hUhQTET~3Z6`1UMl_Z1t#}y{77hNxpEhSU7 zZ?-i0Br*Sg82!E%JJvA0X=h2!lzvjaLQWE+3A7^_#wq_yr^BP&hEPp=ciSL~8H6tU zeTLAH_sx~>1o3yDtIbK2h(%*Wka6EMNPm-!>B)LA z?mHn}n^=|Uv!S%ce^Cbs#MA%Y+Th%GPyS$m=mkd9UCtce?>!v`BE@3#f3H|f=I7*$ zMn6W>5K#OXJPYkG`)B2LsXS#7QwLl9ABunOF9XWjl>U~9mqh(RV(6psH~G6vfOkXyENK@< zp#%Jrc_JPsKD>S8mp{5h2pVg3$GXW$$$BB~g#i&+GGzoQJy~598)Oz-7V_g?z}q_u zQ>ri*BMC@pc#3H42^(g^%u!^nN33EuRNQ9q4EPn0`9HAB1JP|n&Ltj_!eQS&@%MlD z^X}ktcM=S}@A1YJl&WU)>mM*=smLz=Qp}Pn{3%t4AytU31-JDFH+)Qu^3mG`_ZPd< z|G<9OW(-I2{qN`c9qGO$dBOjZ+Yvndu@U7>wisc9OU7@({O$$jw-kY9Z6-+SQiT_B ziU+>h`bg0bo3weVw0ZacD23EmO4Z#FlogeJi zd%G$?j;LN1hgc^bUD~OypiPK}(!PH5ANETy>}AqM`LeWkoDi4@AhvfLVzSBNu3`$; z!}U{=V3FiTwGYr(<@(~FqkQ5AsXPAh8bt=4Vm_+@L)`~*!gnn>zvYklyo2nd_Y~Wa zF*&M4nV?web|se=;aX$Y-R3Vg7G*VfibY_fNcaP|09=SMuayP$a9F877DO-PZv)ZW zH%L==NZ#asZ6(k6_VF-RIpw$~ebMHm!tgHC4Y@g;aP7WszC9oJiG=R;5hxZz zH-Dn;;^yL&N8-kDJ~!lMp{Z`8{!GG3Axx-rJh3=dZ(8J>7FlxpDsSO@{BsSs;1L1t?1 z!WKVv+>*dK=Z8ciFVu6@J9tb=Z5O_`7i0X(EZxB;%d5kW0 zwdedX?kxgWjs~*xNA_3E0EQZXYsO!kKs7pK33P0KuJ%KCJG^A4~WK4|Q&?5Q;qH2sf2BTH=(jw? zj71l&7I%axCh`@uNaj$Qtd{n5Yp;uX6iu#*E}* zYXsQ9=wDWJE9h*m`U2B$zl-{&P?vd6bsQv%T@K+sJvt?$Y4`v>%1%pu=q7`vx4adb ztj6(${Z`?m&_;ybXTf z9pElgi}1y0zI>Ux)n!qB#Z~E+?0HipQiaNS%6WlDAVp4@gz6pK{ejnSH+8y$g1!X7Clex>_PT!Wd zvTjrq6he0*oU+7fX^TGo^c*=M+TM)@?q=X&O{jMy$Ai1A!&w``EE>c~#wS|rz! 
z@tem%b8nANJbd+XLmN zAGc~GM9~1DQ0s=?w{%`Ysh(F%; z-fSr-DEeg-=sfQ(6jzlksPs8C%{m1<=r$GUmhqoUJUwtZF_@20&OxKa`qlk0u}_ful}TF(5Y$1sqWXrAybJqU&|HftHm9ho=J zaLAKvAN6Rj^vD%J>L!ED#bYO-scT>;qkCbYNvSl4k_{z&!TMdDu)R|mPmgF{vjauo z)31aC$6@wah&N`*s)xI-nuDZnxzDN8cI(;>jR`mSl5{GbSdXw>0~`JD#9i0vE?hkH z6o1Q7i?%WBPP7$u;BMaUa*kyXXoT5fS#sSc6fkd%8BP(>KQi6Q7Q=>-R zKn$B3cMdH8Wi!t9vF%p@e~3Uviyv7u*lB|N`1G_uECO~u3BEkCf_8eZ)p4qdJL{vO z>)B2haUs&$9(Tz;j=4*1y^R|A?2+y)%CW~$Zxor!H+bjq5k4q}XlX*L8d7p=X`U2A z8{@eS*X*!bOlBMMH&?q;H|A(APdD#pKh7SZF20ZQbp)XG$o;UBY}O zcXWQ%(*cT19`_I_zVD2s6fk|u|26%rQypi*T#@fDv?)joQ+_hYO*RKl`Y~wr>^KY8 z5s1vVKQQ;^$?@Or5t8#&AgL2(t+__6I9zt&vT}AZY&!aEnyc5D7AEAPb+kofzB?`r z3+kBa*3tbua27mvK4S1J;TWY{C2Yv;7_<50+5A3jS`-5e?g>x*#7V4cMZs&CDm+VvGld80S)QV+$# zXJubJERPVVi6I?8jfBtp8em;eAoTTAnP==>_T-Nr-we#NNC*KM6wE3YNp#?1zkYWUnQsG+`yD-K5I;ld)b}e7Da&#fut1QKRZU{hq*kuOu>^1GtVvb0jJu(6mYh1BUn#sgz!7E`--V?%76*)kFu1%ikafn5E0 z=ifhW--V*VLln3ax0|Z*&-Qgi8RrBrm3)_K^!WZf)poJWD5u zR(qRtL5r;WCm2>;jWEUX{W3iCy_39~1$VfCEzud7*zn6nzUlr5e!xbz>Kh z9yb=K11rdJrV|x5<2PeuJ1tOIUhV6X=6XwSMF*g@$H(?9#_)2Ud)7kJp%)@+`eKOaI4F0ARVe5|Bl-&pKSkoS8}mDCyQCt2BJWurZOpDtL2try_mL@VUqs zJhflEyOk7@O{=)T^VgRe=v-l_oA(ZKFP|E`Ddi8HZHg- zHKt3`O!x-Jf{-w6k7$ZU#|_|r9lN`CbIL~&#qjM(FVjmIkt@%f{`XOl57*J!TZ^-C zBR|_NF1ysWew+kk7L^Ky9xi2!H%`JwKJM#)bN_LBrhO_ek3GFB93fzX?~}+; z;}802%wQ|~`qKSIseI<^O8{oN77i;82T{&hg#$y+#U63mj%l{LKnEl%a&T2vLhJ4h9NRK4 zNW{rQ{IpkOnB`r>(?i4Bnsy@Q@iPCGr{9@zbrk55^=FD15@4Af;o5Wh=UWf`X?-76 z`$1~;=ByJLFOQS23$WCI=)=)a@mRYg$Gu~vkW-rMtxLbAZ$F8YPNmSysJ5e_&2b8W zRFWGuQ$Q{|38>*OIUP!S+7Y&+S5Eg$BjYCL0iTZK8(`0*&&W{{dj>t%-_Vt7rUM=0lobybtnUsuOp8B^rOQ#$k<#3c?TgVwp z#I5T-?H>KS>q$Y!qzwXyVq7)j{8vyj@o`YqRVjz4Pbjy6BKEM4=MB)WTJV}o@HguX zc=}j4ZhfZaxY;TrxorjNP->fAXihAdsigq3~xuqKA#QSHyf! 
zj7rF8m^^gbhwZyh$SlQ?^#FH#v|tZ!m*P>C(omsu6z*c{^jhf~*cFBkd9|;`e>%R2 z+DuZBMB2zYisL?#T8eH+bAa>z`2T{9h+szcB;Z%H!{Z{f*fP9she9;+Lu;)gpyCv% zlJzU*5sN9wPxiH>SRI^-CxwaC1GT`npDA;Fw>~rM;)#8a3Ia>y=N5(^huNV$Xos9c z_d0K8G}Z6bxIF%VG=s1E1{NibuD6}_g8K^xdT~_EuZ#TPZ_~E!hdl1tPSn5yyDVN> z`9Sg`76P98YsJPyK0xOCqLtnkxTAJVpaK4S9uh8&_ckNqbrA?vNf)7coy?++E9ltJxaqt)mBd68~f z$5I%J|Eu0J*LM#!b6t(K!|*&F$6${>0cSCArTe<169>rg7D$3Sp7FmY*%U7Zi3AM} zNziA=1}IJ@@&C^(z=gqvl8<7$V0ohmA?-vvom{JUPr!&K& zTM5q57HGh|xe{Kl4T#^U4hQ~*xoGjlLof!;>hz@~KUL@03Iw6252yNp6=?yu!w!36 zh;I9Q>Y|K%ti2CZES}L(^sijtpa}NMj(P8D6d3w8iCtEamH*UUULZ8bHdrVl6OpD4 zw6NYo-D5|0R_<@hCqGv&&%WEio4ZdZ$p$5Ttyt^^vByjycQb)~84jtP6Itw8cSZLB zjUWiD-E1sAyFBIhUe|`V`zuYtyZFF5)ua*s0_`3-5I{5^+QWeUyWPz29H%mSwj8{< zEq{sU#ACA^pzD>3}ivZ#2?M&mFS7b4p?aIOBNdF!zaub)dgw^aCtdXMY0y( z8s^*fW46V9vvK zkv&3b@4QtG9niv`Sun;AZxmh8b!MDW{m1p+vzw{c2?1)19j8(y*i^p*6hVW1%JLT?F7G$%FEDtErWr!7IEqp= z-ae<>QOp(H8%%kJ9Exgozk24Vq-V;T)y7bTvi4mx&+9 zlX@ZLmWGg!wZq&56khGjXdKLWS#fGP)`AgyYDrPa&A7CGgEu94wrPu~H>lJ-jKQC> z0`M_80@v2M5Uvody^4k56R0W6Q&T(a{@~3#oY7*d)iNOtvO<^$6xo*fU?6HHy3+tJ zshNNEA}dB2zUqKFMVty#hbYSUHT@d+!-4niV}cWCzOwz+sbX^{Z%+@PjC=K8>PmRF zB-Sv}7tV<%ab-8^1fSz)U0UQkRUiA;fA#CXZO8O&JEsOe`!k~&`shXtwT?NSn@-vm ztF}q=J(+`wTH8YFoLfY{8hl+?#}%%BN8q;wWr7x(D__qTav^nR>$%_WuNn*caV&*4 zzc|SFrlTfIi1y(P;0c|yGR3bTuPJ0KOOkG4I0+ZkK*G@m10#7l_#*c)iUIt|$M7(e zHGMe*qS#1Q7m0+K3^TTlJ|csw6^b<-dWquJTh`J+uJz6h ze0-MZ2{;D~K5{5;!PQ#nsPlOS3hkq}FdATZvKg6i z%VSR^_I8`5T;*?IN9y$2L4_h_;225(l*%ELPEvw)^|94Jmh5n?fb-kS-WT-?G<;9d zlzcys>~_{zUqv+VGX{EUuREad&$QGZ(E22Ni*?-fmM4vVnVgLty=?YQkwGfZ>FNpp}o@H!a5Gz?INp3DD$kd%?SC9~O@`XRWhJ?!e} zaFzYaS|mx+>3!;rS%PdJ}Nw%{#J)Jo2=!t zeifdKn>xU2C}r$*j(G@BB7p7hh9_1|cT{|~ z?bg=heDo*+y+H5;4H3-;>1GL}(v+=r$rgUh(7*QQX53#$Sq@zEszsKEr}*4Lc4OlB z(8~+jae1mr@3!L|z%Pfof0O5UvBUa>z#34pSCbjj<<<#Z0R4k}`+b5H*?*;!Ij_xo zGDjuHd346<&9@AQ8A8+&pY);x-QnON6jyxydY`E$`qY|e?~W%P?sZ4nJbVJ~nA!Z6 zp1K6j!N<8f{5U_Ay(0C~B`8w$8wia&4UQ-gcmpB;_7q|P>+(t-(5K7y*aMoZh()8{ 
z@R`V}d2XLc%QdZu^8k-mQzaIIL@`WfySxr8E3H0R(j!B|+Hl_+%qim<3q^YeODW?c zwX%F}OCwXGWB1D42BrW@$K%smxaLxZyYNy?kRRDN!fO+rA`n0iA`{y#A&3;z_Ymn_*bzogtfCY~D(Djf!)%ZVVWGxQj-+4W+3VbAa$ z#L$-6bkh|3mNIQocs&&KW;VA619BO-ORyinbHJUKsg@YEWN2QW)B3T*7pz`BW@KUy zxe_Xr-$?f@+ARD><6-i!mG0{d&Y~tg6D~<^%6y_VVXCADU0I)KAjKH90{_`S3&Wm1 z58b$(`Q!aK|M9kfKICC$sdrd#Nk-G24h|MzlN5NY-I%rUzi&Jon~Rg>>xiwFBn@ps zPA@XEAyoPPyHn@zl9xe7>?CZV&lKD31>3{Qk7L43*%t}HV!~qXZ^cg3phZfdwznY2 zpAX0QT6S#r&ArMsd0EUw=MhQ6e2`Hh=pN`-?+{tHW9V?H(S>IX#*iyXZmxt-g?0h)@=p-{bn$%~I9bL^h5iJL=5<76QMA z5J4KRl`icd%hIsbkcMMZv=z4%9i1j5rxCfdq6I|!$>L96n!R+0pWb>h(?)n>v^IM! z!JD9jev{xsA(K!gHtRaME{yUYhn`qpYj@!Y1#Q(pRsu4j(7J93=P&P@SVJ}WybZU- zjMl32;vP|nS8#@?aitM9yukZS zE6*d2yp<;uN<`2K{pNXV0xOS2^>FFhH-vu(o=Law2?=41SI_LD95pfq56_nYmq)6C z6=g2bqX9=X8IGi$B@UZ}bhB6sy|FEo{5qUvs5~>ZX3kOTsi@DHcxtMC14Zxi#$6sR ztnMfP2cR|+;MYnIAs0`K{~pV!Qw)}%QusJ3n0_|RF3OhF#P2N?T-?_8evtwtmf~5( ziz0gIC7}M`$OY1RYPgN)M8+rNB(`lQsD#lDRL$9sO_2w%y1hDGC!Ldn^=iWU6*>5wtC!3h!Hr>7|S16X{16>UIGiF z<~S4p@S}2?+F`G}#0*!q{Nv+lYSWG5k^W>1=~B`d7CMh8CLXzcCpl`!Hsym`al6YY zK!UC(oJBjK@zBU(~k(zlC) z9%;Phz-}|{3sFKbgEWar+rYm|`n_~!8TzC8>1mhrRqi9}KO*F#HB$$!1hq#a0&F(X zL|*KU*%2uY!P10b=qt-+iHWX zPXw-p^F*KT=W0!*x8M1IS$Z0+-3i{g(Q7U8gCdR`{TRz>8m0;wMcAh8uf0s5eqSq(aRMF6fzf96Ngs z2{dzDu|x3_C7GC=fr*ta`6!@m1#6>F&C)=mt$;J@TW>AXrClTQ`la1$M?H#%)dyLz z(hY?AAjCtc1GTCw*@bAGLKHu!(5e*1&4VULWk-zSK5pKWd!Dkv(`@K=q>Gwb_@R`t zFLgt=if6^g7AI-~6MSM^4CA+8Rsv$&r$}Mq)UxjR2 zUmsBiHL)CV5DD2F+wutRDn|Q7Z2C?{?2BZ%DN_g%n&a?W){2rPjF9__T?4(Fgu6AkN|$br!;LC`+w<& zwQ0mNXg!Oiwv0_^79V)H0TC-l0|R!fB|dLjNvS4~KFlR+LhM#ikw*An`yRmQj>aX@ z4!3>pCVEvwbT9anaTi#OQtHbef;`z@=orEqqBTq6IbGWNQ3h?u8)*KGXCbV`A96z- z{NILcJqZn*jn$#P>}2UQ+mWp7q_cUlGkQG%|FXGr*qo>~b0rbRGpP}tN~M{WSZpuJ z2d=+0(q&r_I<^XyCB~q4)yi#PtgB1-;TZ?wZ8PPr@v&&2OJGZ^K5NpbnUOCCopATR4`-F}oo0^;$+16dq zCx`|vvYs!3ixAEb*S7i@FCKGu&Z=%JZf8#K`+vL1e%{nM z?9Gs@3S)mgTp>sc__yE`l$1bgb0NfyBXu&}@n{%+k*APao{_&Vr6|g7Z%l$dU#X@a zK+f{Yi@?jj2QrHqO(=TEMZt4#6Q_(IibvkteX42{MkU@;MT^I;myW52jV&n4iyXTg 
z-jg?+o(`v20=$<5;&Q}m%QZKcMCgOV0XEy6HhAfpPq`7=dnZk-4F~^EfBNx{V}CS_ zUgr}%F`eKDZ%V=>dvj>>IiJIzBkgbc%;Nm;%4C)S%fZ5$nRIP8sjPDRkGI#m0eUtl ztsw%PqYA>VeGr{3T?jXb)ScA+5+@Qeu(6XmbuE2Pph*p_)~c&E^gi;DJ4KMqf3z;I z>YF^rllzGgz@28QR!AZiYKg6O^~caN z@cAZLC8b+!OYf)QFr!D^qH5!I7AXD{cq`U-5ixZ?>Jk;$7wBf z-HKE~i_^cc4a<%M7NqXyJ@V4ZPVD(HnBni=sNZbA-}!y;P0YsZq_DQ2!ZHn)Zs^PLN4svRW^hE5Cx5gy8P(?)KrK zj(Z#Ioc4acGtHOJi2P?|b8#10Zf8h1`fT^T)GwXB#Sxi|*fS&O0avkT8gmz;51Nup zUu-Dc(We6UouRua=AY8gKX)(Nz26uU8jfT3F$MT9X){P4<6@FAA>8k8Hse3lG4Y$Y z7l|7+tgpD3+RYxo-n3kA^hROf;J`3kcXYkZC~zf=zTaZsdbdecF5t+Y;>Z!J<80GL zC#z(}-Vq)enHOgJW^H*b&^36~^18eCFre-qQB_BHaPV`6n2OFX0pX%|VdqybnF>Wz zbqnwgEl~YMW8(HI(<aQ&gKiGqVGsylta8jGd|ePJEd)EV0Po6QHMRs z*K!+E{3W%ek>%&hJHfzy>Yy(eIj)xbT1Sl(SuAx1_b2ph*iLOP2qlB3CQ&YzPIY3M z_JmQ^?%^%HH&9F^3v8sMS4^MMKS7wO(y^PHE0b^uIY}Y1(-tZhGCbp$+1VmLhxcwF=>^iPMASFcn+j3&>j~FQB_=tI{noh0rlQiyxy}~!{wW!3XKHM z^#uNnhSSAH3C6LCal(q9FfDIN^9I^N|8{;Juo#^CSr&IT*FO%2qGog9Z1+O2fy71*f_j%_BuHo>ihrTSLR%kU2)%g6+=Hi-+bW^qlk_be;=R4l@cK+tiPxda}(n)r6HMtNmSCu z48Ap+xqF5`^cfZXj5#gf>^Bdbb(?ap)wK4}_1+^9+nssH+4DU=`72@7FNkQRKGAh9 zZNt_gwua78|BjA-f(c3be#(diAgP9-eq7(F&Y&&hXaC4F&zao6M8!($j@i{2vSxVA zz0o|l!FT&G0ZqT0KHds7b4s+Dz)1d)w6zJV^aVDcL#W!8ZM=0n9Dz(k-+|oGWatwC zPQPgXp z>J zs|2+x4(z+op`r1?A-Ctq+tTxU80`InmcBVo?CfH`y=8OX$OjW&X+X@h8cB5&*tB$V znj#gHlQp4Y*U*?UGWp`g7O~$Y{+q`W-_ng+<;M@@_f(3PM=tKj>FKU0gn??>!~%xB zt2lTe9g~x_J`BQh-&@4={aUq;;G_zzg`n&>SE1?OC`ft18(Ab@D?HtxLC8bSLKFgj zj=1I}=R4^SZIw;7K6>#HzR-Yb{)zTCEoo;&4(V5us81yxe&-ts&X1dQQdONXO%%?A zZ|X-rba>+m+7P*?==Sma#;a5eQ*twV5_$BvX=`xmD;YZGCW>7fB72%oC2o$1`90j< zu~&2r({?>%>31k7mOYrSL_r$Rsf4`q%$b3(4ztb7D zJ#z%TR$l~{>%Eh4!;cqyLzf;1dC|dJ z?-)=U&^Vze<$a&DPUf9yHA(fmL=-YzsHO~Vp;4uZQS1vb(FhqXpZv=%u2xY^G7BMd zNQfaSTF9EIT-Ov^#C$RR2P~|@#k!18ou=QnTX@CY-Ep=HVt2LH#6CMDaovh~Lxf?Y zxAv~6gn<^_W?U4heoNr1rt`f@;}AvUA%3PkCeS)VDkMkXDk4i(g@fs5 zP-(YVwLIGbMBhR7#q#!W)zq{61^U&ku3cS^4|pxn>;*f-GO$?Gt&cH>ZTxbaMZczQ zo<+cU?3k{1Th2eK)>&mMw>R{4-%|65|fr3Igsw@ 
z^<8gIRq@<9ANlGtZSjjxx6it3zmX>MXC6TY{q4Lg(ErzRda;r@iw%FU@JH#1-dkp$ z=ZB;YHRP37lLO{lbb}vC5>X%hu#xl#ZtbSk+SU={WLF!Ul3_jzyUB+D5jbfPSBlR~ z4IW@0O9{7@MshQD(0T(B=}wfSn-R8ZxseO5TI7EF@bY+F=SUO<;2*9~+b>0vUu)mX zPX1nOO6VLFEO6apsSXW|ec*#dZnwP;x|K@~+-dhyCR^y(AtI${qE89LsXJ}@)V3kt z^$Zs173Semv_s?Li8(oEDJ-H<3l@1i?7u?OnrP%DGOaF5EF2waSDKT~hf}z(2k7uc zM9$JYA-$FIS6OX{VevH0gzCxucbLokt7~b#dF<$YYY2(3gSkiK(OSiAufzu_D1GPA ze|&8B6!Zg8zZTSDotsl_U5qRAKhMVN(Sy+|#yHfLtB$R%3Z*g+3c4 zyWT*Cdft(*T#EnCFFi;MOZ)OIeJTpCfNw{F$mj(xo&^prxg6+y4yt?!>FI5`2^ zz3DJtH^(z@Aa_(c|0tI!xC`#6-jE>Xb4Q)zN~PV2FnF%jh99HjjP)5h8)~r7q%H1i z<3)IWNDLyAY=4v#X%?z~RLZ8%tW+m8K6-|>YRil#gnc0;y$vS{IhmrpmCcBxrRgg@ zLmbC!!Y%*2^BahWV4Zo|(TY}Xp(@NL;%aqhD03iiw8_jVzu%7}NVYx@kr$q=%dKAdr~2{e;MzFU*hV;A<7r%OWdwwIU=xR|~4 z2i2V3j4$ey)UuRE^9M{$KRQhhxjcI!J3NNehcMP6e~81cDVzx#r?-K9+vHYt`OHJDRx0`+~^ehP6zLq`_;j z6vnibh1#7Lm)+#!()i%=TDcW9bp}8)qL{BS<|^r+zo?IwdzxvCG8)K@tW7&spj($a z3btr{^B+a^6jS_y`HA$?BKM;lL@DvVhy))wBQVverHF{XP~^oa53wsi3I;+JmS-Pm zo((9~Jv9YJBa4lVQ*Y0uBVTTyEvl}jJ%4rkuM{V}0cX{Rai#!hc=TFdNy20eD&XMn zUhC}AZn?BL#&?Em6Fj*=9aNJljD;y#wJZ}YD$Vy?;E+-+$^SO}!J@K~MyD!xg59#u z?VJ`^tJ`P9>M>uo3m<GVdop+grC%I#6+uDXKww#XVxnm5T9KvX62dHXE-$jzd zm1;}&hyFqy9m+N6smEziaB;t{_;Bj;VBFbMYx6+kde*Fq(d@G}jQa+lKF^eVNxIKAq3X_1NJ zwTUW;PWS%hG`2EsIgLF=7$zRZG0sRhDQhCYYyhc)G^$_(BqaOK+vVpiYlvs!8&fDg zC5G@|iYC=A_9N9aQd;9psRg7teIBlttivnSPYDD` zGtM@~N!=3v{OyV>)fGJh-||@V0%3f17T4Vy`*e$SD}x{PQvB~$hYKYqBO^xy&*rri z0`@`2a5}eigggiWF%$IKw`>hxdVt^*L3(mvupTeja@)2Qay$E{3pDZaw!WxC{5L`f z(RttH6t?!1q>u?bj!^`gx@Avl#M)v%+1Wi%57F-lsjaiLT zIFuTE1gHD^m8q$fZubf@vD2YpSNWak1vYxtNdS-QZ8+9MpqMY1(Hs9sWk5vN8c-VJXNEN9kRx@sSq_l4=OdgsybPOW_&WB3{lh zcifzyi>CBZKy`M$9OS&)e0OR%5LGOp^+{Khl#J{)F$&F72JmCRw@g5dxkRZ1IuS+S zExX%Jf(uO{LzcWkMl6yoTm8|a`g0w7n$wTox!5m%S4&B5wO9AhF7&1a4XxaWMw;Gy z?Qv$hzn=V4MEiIyeZJxz)@XPKk=WFW?yw>IM%#fIHtKqX4-v>()3jyh))RGn8K-B@{VQWkf4awXe>i$wjy`TgVZWSGs)V3^sdXkXww(QW zQ4h0Xu7uFMiRnotBb1bzOMWQsJ+Ivk2p=Er3w~jDA;eHr+UQfxv7noHE-h#H!3gNn 
zNeZa;Oz{7+ZfiQmLzwkJc}YTO@K~){*%kd*rN&$Kx7Gp)iB_j2IOZ1}ZCI3eQ88b` zC_cLH$!7npE*HuQ0+t*H^`TOH*SrCh^Gz{jr2-|j#Fy1Fb+Q+Hu$EY4*~D#@scl+r zA7tz`UH|-NGV<#lsiOmq@QNhXSsELLZ)5h`0ZdGqmFA$z;gmu-sceTA?+-U81*=at z9Y&p&y9%B?;T@)n>U<%n-TD>Qu@V%YcckfMHf_&*3|G#)Ai^UE2fGYO^G$t^^6kW7 zT$xLJxCD7e{6icB}gUgnn@;!F$;(4J)ni~6HqSrYMAjQKX2!gt0i zqA)WEM$rld%c;CjK#QTdZijz);ezxj;DNPnR-0kp2_s84iz7nvQP5+DWM*pVX6?a9 z7%GrI?)_?~$_**kemg&JYK>BpR>{evL5+NZ&b+F1My*XrQ+pE$`8G%S+Y#sW11?-u#^1k#Aa`--}=n z*Hc8K;pm1U7S5H-d;VoMYWHa7!7tL#oXXTaO3btAg;4N~C`WO?-1?WUF)fDcXWf9R=Ta z+7}XHcVdDDk^P5;D7Em|dukr-$_Ora;T z7<3V1naSE-&@PHQJx=`4K2z9F0m`CoI^Mz}I3Ha(W5@c!#HN~k3>dq-_u*QQZI>dL zHDo8S*SD7 zE}r&IxZ}Tte_0SeGe|%q*yS#l8qpx2X=%1gg+V5N!pPEJkk+YI>lU>~{VmQF0mdw8I_)<2DGZk$V`$RO}(;3%Vm4(|JY7`9^g;Y z;(k?nHLyKgaX5=M=!PncTeXFRf+sqX9K)*E8p^W3 zC5Mhq0^&ogZ^GxxeGP??~0^$Q(78UAU#P{Tt1NK)B?CsjTvQ^8>6K>euKUatA4g zIlQK*quDw~fhWG+m`Hp2J$zgTr1VSA7z5odvz;l_9||=(rrkI`;oWXiP*srViFyD3 z@$`-1ku_brgV?rhn-klZ*yhCcgcIAw#I|kQ6WjK;^IY#azxrqI?p?LktyN`It^w-K z>o4ecc6(6evlVzv6Hy{eB%VBD0JVHPc68+}RQ72?t)RY6gZ_eE(dGnQO${sxLgPy-RylTPDes zBWRhgyD?VoYT;bCB3oNz)5kv$o_Dbue@YhoU1=Pw@!_X=z8Bhh{?dPZ5fC>5N#Bev zOGOSMkliWf)53l_QHkzt{M5gf-el)Ly(-4M?7~D}?+uyI<_tsQPmuk~K?q1Jk3U|K z`O!1Q0#}Pc2?K~p#VR&zoS z8Z729YrOtO?`75XUqtTz-Z@g<^bmVd$5*}IlMtyZY;S{Sn{G9fFT{$=4vc{gkeqdP zrFss84--?Sa5%3h&h4-jxLqUd27|BIP;mJB18Y&!Mft$(B8~L2AMD^FupIVYR-S6y zMzs7HAECPRuOsKl>*961+=h-!&15G%iE(P&XQu9>f6=@-Y_v9e})B`M;) z{TL7Gy2O{x3g`^p`9o#2s5@PdFOPu~Mn?)Nq)+Jf@=o&9{?N3LM}DPL6w_`33%qa$ z6B9MK!>Cd8tl7sU1s>J%KIiLQF|^8eIENI@%6hK|U&ODsR!hk30+Tg=MltXh(%Y*& z7~ktX^k{z|fQ+TGbUR-gttHYFN(@2ru-kNPl{d<}oP=d7D<)8y*}nA^ytQ3I#WD_Z|vgV=IhF^^XyWzI@|s$6AvYA~7`hdakAwALsRX_aKn#}J1u{?R>JkjSZUv{mD`5@Wx*%P`jmGMt z8mlWRo+@5GY60WUKNx3}vurh8EkkEs8pQ~(v?8*9&F?O4&KEt-&ko|&*3(5raZjBe zh`hfL?34;)TMfyBW@b{^e&)jwL#fsOK8>~N;JAc+BL~!i^&p?x}jILbOE zWOWijFk$YOYZ`vc{Ar>GoaQR|wVxGz7LQdP>}(hzS?@f{C+zZxNk88P!b0ZQ>1@UB zrRak)Ga^rubbh&U7Z?|K{(Al1!Sh6FJirv>2D{_(v3XTp{9{FjG&@=%d1f5hHw#|u 
zr&anFc{PONJ2FuRY;maI4BK_IF-we4f}8*c@O`JW!R{+?^(o7*yjb@?h{QRPyd&iV z`MPcM@NMzk(|RF;r!gG?cmrHjE1sawf~D?mL-}PVHN02-?ZHX?Vz>NVMqZp0M9kEl zgeh-3NIak6o?V_>aD_C#b)&4#j@b5suL2%dbmJbEAV%+H&B4_gx-0h~U>>ysZiS;x ze`hL!Xa1J*i(q|D_$q5;UZ|MmQcbYf(Ow`>j#>B|qk}f-1=mZ-A-;*^BNmghTn=6SN6Xg1;K#LDa>CI$HXbrdcr_8|&kL&&sj%H{)Sp&xte-AxyXXz27 z+>w;z15ir&Qg=%y-1;Vi*Ty@R*0M@YFRE_)NCZ58OX(MGW8)z$t*Q(z)F{?cbI(%> zSiU!u(GIVeL4{C(3Mj9? zcUEl!b~k){tQ}b>@qMI?qB2Q3?$AJk;SJ|bEV~pDW4I>v&Z_K?@j|mUaJFw}67|(@+bZXu?D^3$^4>y*-^QA}KZ^`Ud3>GEp87U67z> zyk|hiZ2Bv&o93)DKHE;9ZFNgKb%{m5bgG))hQAWeHQ_z&`NnuDN!UnqZe$2Jk`OT_ z48filn5Qz8mrUSUylw{Hm>gH_@1rY>uX^I9%bDLy6Swz!v6tE><-HTuQD#zTq>Qr2@|sk4`YCD z%-37HlYouCrh9t{FfWD}trZs~%yHPwwaj!k2kU6nBuF@$X%Rc8%M@uW-GAp}J5A{! z>D#6Av+#>`>9Ti^>r+`Zg!Ad_9-)8y=yH?`Nsvl8+#U<;5oVEu<-hv~mFS4GpfSGm zHXD=4u>6V!s$J@ef~IFRhn@Lac@*sR;TMvD%4edYZ)-CdkEXQ5ZAk&96aDg*NL7%NQ{#a zB9fd&tdEBHB3iYzTxE*tI)SY$Sho=8da(X9*G_NH2&(d zYOZA4IJjONS91$flSof)`D7p8Ft?Q8~qI3;TAXA)}TQtBC$nV}%;5=$Do;-*8KWj&BXp!Ia!x<+GjV zCM;NtC@AuYDH7^Bc*f0c@q2p)6d&oqIe!rhB`AYtkirWzK7$LAQjCf5$+=3s3M}kdBrw`xtR6udNwtRKlr*25|3Wzf0 ze$|M~mYeNP9<|IO*9D;Q?xyS9VX3Cs*j^iZ6J{-YmZ^ds32~-bNSq3(Fsh3?qEL?n z2|7EWRSMMX2gu+)d|jzR5wX`W5|ht6J_gLt`kL9}-OuF4rw2|+Ye7m1x6*Y@m>bN2 zHsqV8c(+iX;_G|dI`2DYXp-+^JqSfhRv%8|{=r+HHZ=y|A~(?e@**P1d{)`k{2tq1 zsxaB%#)lBMJ%x7hguj*sEqPiqXZ`l{?>5WU46yNBJ`rAwa2dOOU*oH~>A2W`KDDYzq$RgJ(+Fn|q&B|U=V;?X_EKe| zbT~+zfgnZF+Mg6U*Lot-F`gZ*N6*mas=c0(^yd74E>e_Zv>D{cw7F+t&P?_z=&31` zc@;C7m_YIHAa31h?I_|spU&HTNBr@gBNVR_NpNQmQViYEllzmNg;-2)xd`50P$A$a zA(0|<^^M=$D2{Rp<7#&#Qt0o*f`V5ylVELH)XIO(C<{GZ;%HzxhVoLT`B5{?_p%^g zgFL^xu@l(im!*A~eNZB+6-$c_p=122ZO`;v$Kq&iemFZjT~v@KCpW03bpvWO|6?(& zauvgufgoJ`=;e@RoUzv7hxLWOgB|0M=>(BWmDu_2Z@>(?wYL|vu+}ckJqKPoo-P%Z zm(!o>Gp#3HnoQ;^&aqXyP}0eUN`M0VNcnIv8cgr6U_e?34?OtZeVmm+welMzcEc+v z*5TmmR5(uPEI2+v{ofT%Rh5~w3Qo)5GUK=TJ5QfykBNdT!`H|wp4nGS9h)2b@+`|& zWF}3`(?Xek=VO$x$#hc79Z`FiGeLA@p%s77e6I)GM6VYEUgxr6OcmPUA;e;J;I*eR zVnxvKwI%})5T*;_0&(n&4ZT@dhxWjyLyTAzCu+|?5f8kwaJo?)6d*#oCYQHdYFqG> 
z=z4b)aY=pG4FvYOTO*B}aFrclfmymwy=@|scxF4>3a(9ykXOfOlEty;nw>-Pl{ubH z*VNy3zN;fw2k|m`xEA$t!`xwMn3B`@YegDwcBtK-{QA0u_=5o+E% zi9>bY*n-I!a(42dZGyH9!YXz0Sg4S!6}juS7N?AN^R&5P0`9vb2^ET?%~-_C%c3>Z z38&Qv>zUqY6iN}t;>F`l3SF3)K?*RDm`R6>3SXwn(Hr?Td8%}x1WNn=&g!?_XqWb? zX@uoeAqeSzH4Y5KfP#|YM@P}4pb<_>i{K;2HZR6=!yIx5BP7q{xe>)>JGf7m9_|NYOyTDZ@#b}@F1UgjsUnj99)Ar!bX-9O=#LX~_^?LR; z*vJ^foqJrj#3qrENN>v(gEc$_?Rl%)dv^fIq&K{fHy@Iel-9mi_5`ilxov|-93mfaG=x(lQ#1j@-<5iijS^DR6+VGQrteQa zuL8q?(7P9_V(%SZLs(6EyzVdyd18dN@{m5dL{4Z5T_vK9V7LHjuBLUz{sK_f&=k3` zvITGzZfMfDR`<)Bm=hYP@V-iO(7dWIV=N4I)HK)->Wlh!X2}iEUe(j_?hVfC{7^p| zqq(^kDYuR*S*pysT)a2!c#b%9e|rg7>Ck*|-s7`rulBuUY!($;&!)0f3l3RXRb@QV zG$3acuuz2oWY50w;zw4Z=zWWmQKe>Z&nh-h$IOD@#mxg&X z|Kv$wC(QWHZRVW?mt7AN(VV{c{Cv5ojp%uEz#7zGu_~ZQ0Rr6o-Q!^XdN_$|(-{{&HF%EZg zcJ#aeh1~=_br@9<(Kgt7*Awe1+Al&HV8-g|7*1S{!^4NMaUb`|ND0Sq0w?E5J-xWa z#_M+te}J9_7cNT7pYa0eTP-udXMa0StHCnGFgqD6A`s(=@hfH`;`CRQvg(p-#u(C@ z5_WVY1_L;Vrfx;@*8X!E&Rb2u=zFjI^f`ihKZ##^B?)RPFJu_#t!e9JaOA_z_K2 zTc`hH;v))xv>2TZJjkiC|K=Mi4GOAC%#o?XX9mFsT{imxdQ1VsCt&mA%afmTDcCZNL`(~!`eUx8P~g(&8! zir9(MK9Tm8wpOp>vIEwSPA6}Fh4lH(J&%xr*4A1Zn(xFZ>l;Lmb^>y%;U(qdcG)CR za&Drb*`rzN0x0?YOKcXJqKoGo&6tzntihDZ=uLdayc2_I58RiXUjbF|}mvi!! 
zE%S0rRE-tK0Iqjnm_eU*2qk`RH=4zgEl?~fl`Vq<5rFu|?SH59^x?VSFUD3v>R%n- z$?1%={w=RhGWB?X*w%UE_4pfEcbBO%r}Mkw1OGLlj}HNW6qHb~B)6vXQ1E!efM*;Z ziUB01fSJ+K!g17MGN5l@iPKA(noLO()etcxrQE1gRszy^>%eJsqdZe}EqEv1AAJDa zc%^bEk9^5v*N9Q!2rJ9@igD-mCAq$dQtPhxqh~$?5R@}*?|6jTgRbw2E0+O@akKJu zqGQSHj%e-y&0jeTXeMg6rvmEUd~O~OvDj4++n2wfqVS6@AMHpiSQ7)l^?B_{_Rpw~ z^hJg9BeUbFPIfoj6v_751;&$`&sjdWQTj17Y0(zx6dkG3b1#40vfHfHMiC_CS*@Tj z>*^Zm=%f#iWgjq48tk7sUbsw^^U~mdD@K>TLna4L&UHIs*k^J8^MD(0T^0}K!(B?M ziaJwR4V{QEu|3tfA2{!y7=!bn_X%>rkws4X`rt_)6dP+VAe9tv6t>Xvq@EV7M`f0k zAY!MA)^9qoW*9KHffv~NV(nL|KpelJL$S7g^7dO6z=homKojQmsJ&O2wc{wE>W+b> zFhc6UTMR<{6Lod4r@Rm}z+5dw%4%e9ZHrXB5big`uYbcLsWCx#HQqQ)N=B5Nvc=?s zm%7zUecY>~;o20C`mLfiCK?W{6~UGM{~ZCZm=EM}UgXT>dv#Qmm0H^3 zV`HCcxXF@yot@pT)w1LSzNSn>VInQD7LWK)T%;Q8#5P&vdP;sDj zkkIv0oKAXRVJs0K53@5J#xiOBf7;vCaFGU=f8#`onoN}r=TYB1k4ZBn(Gq*CU$7Hb zfQ(t)Ej14H8${y@UTmJ=GbK(6@gKOKJMlrihm6Ohxpm&WJgz(~ z)P*Jl&CMVI`u`_pwf5v`{<=D}-TJ_KlPn~J)%(A zqCD3J8>4&Y2lUHR?A~4L!spQ8BAGkntaU9@;_COXypfm#4ZguTwArR~rxe_2?j*xkIKPVqkQsRMsoSw~ZY>-)cM?%ivLpTop`!=0@!?rp z;jE-(jWRJ)jt*sW5C?3}7F}#8P_aU4>gWLGg}kdr&h5SE^pI5e3KF8**xxXKH9WFa z8Qgc7c(!o+14MszU_Mo~B4IcE@)S<)1MY+@x|Td7{v)&dFFTQoL+@Si1*7Fcnf-*) z;4gX3OY%uPq0Pou%}+0C2pgE_6jdV7zJp1EOo$)dDg3l}InzdT=Paddnl{E5Ye zmg`dddxI(A;EOnT(&vDa+${d`13>t+f z-}$|)L}+~J=(Lx3te7o{7iw^NMEB!ra20#=l*^pg*=ihpFf)g}Tx zrdiiystr`El_u9ZyXwR3o#}xAMR#7Xv%8bRo*PC+GeZLk2&fiI{*<^lO3BgD)!U6B z=z>Oc#g||oIs^a+t}CnD&S&s!)*|XxtDknJkAypIoqCp&W8I7(%7@#mON`?QI3F^0 zg{)}G^d?Zptb91Q^bGp3!C9*@yTbZi`O$E-hcvZu@Ik+YvnDNGing+2{Ij#2v(*-j z9v;D#IZ{*_ek1lxr`4PyC5f)}{?2*Ao^3Waw9A3dSEjQd?4boTdnyB;HSc|#lz@fo zf_g7Y@56`h4JP%YwXce{^b5ZS5b=^^emRn@d!6E&@q3w&rF7B~zdXD^{A~Bw?BI;l zg7RsDAp9d>4bHn+Tq0GByasiLyJp6)`P-*AGnv<_tsy32k?k9SK+l?u1LnG`Bdn%| z8w3*Mv22FC%kl56fQMmt7!r?Ou&~~X5I6>mrmOB?j5~Lr*Ap=^mvRP(2;(6GJGGbT_aehs zu!Q(!ey?Wtsj|bAR;b#i)zsU#wy_&EW671162iunF~4<^tQ5;@#}hibGaH#KL zm6Iu8ch?B}S5H@M4v)AROZ9pK$jpb!Ri^DoHD7jRWtTc%k(}u}ekiD6aHcgTTiguv 
zMR)unJc35gJzm2czuZ!T^F%7{f8$mUn}Y?qJCquBbs_*9`$A&)Y`oL}VR^r1GPW09^_eDj8_<_{%6#wofj@eeyXz z>L8}!A2nc;3+-G)wP>>c=2E@|^tuMaxg!cFc=k|L0f7g{9wj#Guoe%Ox%F1S?FK@5 znPu~}&#S~Fdnh;knbc$W3d81?vK@sBhoz!|s&&PAXSHcvM;jD*&;AKLVE_xl*kblC zi>3Lt^KBWsHA#uGh$GEAvG&MR{g?e$+zc+JztaTIEe~EsCN+4~zr`3eRkf^G%s9lmKZcwRlqag{(Ohgqe z2FQ@kua)c!k=m27q;<((kcBE48uWflsq?!Lm` zzGpfs)3&YV z_CoQpaLdcp9~h0_upTPcZsr65VfM2J?xaPMms3RW8-5tco#FwVhMa{oekxv`5Ax~v zz3kc1!A6w#`5P7Ox>m__w#6!%%4&lPEqi=oZbWDf3scz4Ukat4eSaw&AtB?9O_fm4 zL>1JQv|n7+HDYhYzL>~hZKBqbx&ycBS%B+P>Z{;+Ig!(1Tc~m5&{megnmUQ&;5k!e zUOYu-Y=?{5t5%Zjq}k?_!5GgLHoaf+Lb!r1r)y;eIHveRTR1pnDf4kZ$`C#Y2l zoJm@D>3$2t0@_;nS_}jdjA*a+sMLt0nwJ|Z(A1f_48JIO7EO)hsKBbLKMiGM*ro7C zVdIaphN`h!>X{FO=RA(7Upt=>*BF^aP_`&jn3BA=^!rHv-EJEfg zI5wHU8lxIJUJ#9M=$GO+h#YQJ^GoCS#0u=zLPBU|d069qs%Rt$v`ENl)xL{IxI1PW+b~Hkmkl^`> z*#5RXH|Uk>Tw-J65-D)!A=an5#@Pz;$?R3lfvyf?aWv)w#(prnjtC&RZ<_V~(`Q=v zX1YQOh!dTaJBQ!=E$Ih6gE1?ZH`t}$!A z*1Kmb#>WOto}B8VNyIKL?BlFKFapc@)f|g|`$N8|l13Me??imJ{Y7<$!W0_*w7$n#+reyV>hgT%knpuRo&K#Qm& zRO2r21Kn0Ipy8v#mUlq?Y;3KcsMGKKLbcWK)51z~oUPgSnl$KMWnlD^&C*I&%H~i`#FdjLFsMHxs_nI_XnA+zZ z3ma2qf$=W^I{}b>a67Uy7<4AOUxYi~{>akz46XzNRpantI;)UPrt~&P)4;}ilS{LJ zlM_dyJrzz?_Asw9NzD3ac4lg`$?tst7Agv_-4egK^hCS99t8jg?QM8!%0NiV?Nah~NZpB@Jo) zriWkgiU4WTAcgSvbt_d(ZnOxT^{A!faF1Wh;HKnoh0yloN#XyH-k;>pyS(q9G=W7s zz^eb`e*~0)lx;|lPzsz1fA=iVo@w*{NF}ARo#dG5Sx?LjP>I?shj)z=b%+ul-ch8n z_g$F(Acvw1yjqN?L~(wz97yFkyXQ-AKy-@#j760Tup?a~|EhJQmV-VzT8S%F z|EccehT3ZA?7(%#QZy^OV{$amw~vf@#Eg{CX5&%0_g@@N7tx4`#@%r+SIh+lf3o1; z5F$`U3V|dDKxK_+)CGZ15%B$DJNzJ}ghGA54O3C1ExShc-@XmK5r5GZ6nwHTu4hgB z4sO6|1W^KahOb`Ng3R0;ABeodz~ioZe_G5 z+CU|Xj53}OmIjNF{hwiJ8^)CIZ5Xⅆ$u-kSc6_;XPr*1myn71D^qjJz?9$ja%mv z8S(?!gL|Y6o7;>HKw)TtTDjH)~;P>ma(|p`thpb3v|V>(6(QdArl` zvy^!|v{_9VQ|&%3@6Ky(z`z~jXUyt1MPr_JAp+t8LV0bCb<$n}@)TA7rP|&YP|#w1 z4iSJ@rM&AS#LD5}b;Z$$$+)z^LKnG0$Q_;~D8LiHR+wlTi~fU|37h@J;$kH6LCR8Z ztubrD%hG=^!RWkMiNUZ@J7G70L-TGu3JouPe&V6r zbY`xB5-`@|*;m@TFMfKP?#xF3=HjK>(!r;JaV0Z&<5*i%N(<9g7)_DzzldZ!DvIJS 
zf{-=sRtg?GX>(x^b40{^uQ%BJvlV^TIeeflL@zH?L}mfDLTFA?GLY1?Z?Gw#AgOEf z$QCZ>vF`%P@pIn!Oo3!7L7n|PgNqt8A4JoTfBR3fl*NA^h}Vjw)(T2Kp~nZ8YsF&mzl^I^}26>+VjC50Os=YKpBr%3sH>nhXM zq=UtiuI*goe}5-Y#9)A5;}Yr`Eu<*Ra_isCa)jfB!I>NXa*`9wA6RAlkzTS zVF{i5iQk|9fc_Ra1{KYViG@c0)IHjGJs40>;wzP$&cp8x71ZD`>!isJ^^_pSckD!bU?j^Sg4sXG-VYZ13QVf1+uAV$#J7R)&2fkFC= z)T7wL5$pQ9H#As+QhG~#8!?gu?`w9kjFNlveQuGZoW^ov$ut-hqYqKdTua*8j@a9aXTIsUl~j-CODuKVksUl-_&g%BVt=E@4U|zLd}6Um zhA1cR`bXdWx&bChlpR}|e8?RGIZjr^ZbDy5R|%ZbgW4c$qc63Kw!+!4bF?ovJK2tJ zFY$JcKv!Sa=e{crgrr!;S-*WB-mFdPsbZLYM8OoP z<_+SkvpF0*$S-U=HQQbM9r_JzAq7DSz&)IR>$EVU0<3Tu=skNBIXDA#M@l)}^{6?s zd})No>T!(yRLj4P>ay0hC3e@VKg6SaosjkHzfjC2con(_N>*$inaDAB47i_{QmF^&^=|*ljx-HGqmVew>RKZB4F9%WV-f?ArArlW3@dPuo@`1tB?8ku2ZD-paY=zSPYE_ln#u~}Fsq^rUi_ySh8AG<+hQSn5P2V_RR5^I`KFA8 z>r}KK?ydZJh`2ybK{iLKMS^qS+u`|HTWQbwLe;7D2nyW{ikrbb1N1{6O&!)+;1Lmp z?N2~ZTPtK@8r(9v9Gc{p|eq3Z!wyaWMfB zt-zP}tJc2M+@Tcb^Kaf6&+#vulr&IeiLZk@&E5nonLDtcN-K}@?d?()fEl1qN98OsvS z9SYkdU+=Idq@@p}ah4dl78FyHzP?;gGiwTDH9cBiSa+X-R=lJgR1K248J$M0TD;ST~UY)EL*ZCS%be1Q|_9-uPOn) zUzNs&+G?%@$ak@=mgsJ+&Ux+L8Lw|I!<(J>p&Aj!AKq$DFL~!Hs9utiZi6z-ns4|( zDM2G$tcU?bHGq64iD^PYfvN_8=V0=JsSLtFAW1|lo0276ug1fLYamHmnIO*oOgCDG z)|RI(J**bAf$K#Rh!d!KV-;@&>3kHpc*tHoG055IB!KuPW0gA9R|q-T=$;A09sxme z2dE>mYa9YVcL7I+vuEf=8K3*H?pp!dGE*<`v!%(g*h#k82c;o5mog$u7#mj`uU#cE zK`IR>A2ZXpub8Ask!_tv&!hO9L;0Kl2Ik@6%x^HzJ7N0~l+s zEGfC?Fcg^0$M^Oen9wwAJpOlI^;yXgifGF0kHM+A&3MIROwf}7ftxLc!<)f|!tDUy zfvs^@XBMhy%|*`FhJ>kqe!_9{0k_YNuvu%OT$uzAKtg?A%mM zF8^}+{>d#bor4==AXwS`ToEKIbcliQ5U_Lo(=uV2Te%yQ$P9Tcl$*(GVrno#zhLe! 
z7$s{aUHCAurG4GRHL?FGr*pB25mAB!N;bHEd1?YUYPt8lg_KiQq)PfOGIydVb1E8~ zb=xN!x^c?UeSnQ-6|pAiXv2b^^_YAy={c!1o@pRB3%Qia%SV|CB_{_6Bz@ikFp`y8 zgtm)x_AVEqS)Ks^8y{Y8z!fOyQB(enjqq|}+ccIHe7981YTYtq7%5On>a&?;RH-Xx z=j!@o9~cb2AjcbMvI}9vR!{!cJv|bxGRw{AY|e0ir5v+32CtkstfZk18oCRhO(W zQzq^fmHXi}u2e-RcyA~|DH+`9yl**+G)j^!hhLt=a8x44Tn)hJ>)idmLxMjNAH(~{ z`E#b&$Hq~TKhfMWgRx@u0aGP)c4kT^2LlOyePb}A9o}m9cfEfXm4H*fK?T4mIP5{J zrC~($^-%%?yMO8uJJ@vt0U;XqJK^;LXr%gNJ7oosysM3#^}DYE2tXQRZB47J4DY>L zs%W;_>SiP6fg|}B!dJg#TvGt_8th?fO8|@#Dhl^Ajs_JFFq)LsXc>G~ zR}AD;JT@bhv1JEtd^G9x82p8O*`HbW9X;WKD+|8smB-$Qb`D6NZu@Czgt8Y6{fHCm zGh=w85tq)qcaEdJ!1?IO=(Kf1Md7QUyRLr%*3(F82x~q{A#ke;73Mx~I1oU*oLO6k zYUt2<=+tMypy2aJY|9{H9F2~1m}Fa+D)~vKFGBPnSu(gHH2E+z8R%NHAEg#0PrdkNE8W?mLV;!a9Y4_yF;nY!?g28KfeeS54dvKFuVxI_ROU2OZr zR^I)NN}Oa!Cv}+GT{m|I8WF^`Itl#(#w<*aru6ki@|YAD9_rgn$6t0$wV-OifUP%6 zFF$*ji|;{c5V78g`pYj)B~GP#p6jd+jI@BUV@Wfwn!w|_DX&((2uuwC;Chz=R16MK zOoOWHZto?VacuM^E~l_@Sb>5`prw`RfF}*mAR?&FPjF5D~=) z6)xQ-C*5Xho%dz2>ol&NPf0BMp(KfkWuQrsXtF$!fJVY-JG-wNg_spHIj|0n$=?U> z;vw|~PFrTDb{!FjEEZI|E|;b-H(>&SODb{_CU(dU%daj@0w{XNbeE?(S3CS)?#>2s zm=7Vsp^f$m60}CzlsnpzEa?j$WpjXi3?lH?W$}#w1q3|qa_zQLn|^37ng0r;gf0X4 z5&i(ZhX{M#$v`H@gBMeQQtL;jk7;*DZEk&t7aJN$@M?0zBiRPZqfXgc~s7-Ep zB>^pP=W<)AxlLii;cn_YzMut^9S(HaN~X86I{!>tp@W|m`9mIFO+|%hg-D4=qOKMk zK$s6P3EgFqEy5b>6Xu=atWFQ)z&b;>hladHZX~REq|9 zjV4m?@&ci;yk9P2=jRHQN=W^mtHS2%3DD+Dz{dNuOeW5!TdUbCIrTqeukYW6BjEYE zqO+prCL)72z5ld0kcZW|^DSd`U+5P;w~cF;JziYylVtMiJUyW1O)DdxOv2|djMt4d z*Hf-fwN@5G+rTRQ96$<~{sBDoSB0;Drlatennw4)d~f)lXaF!uhCm;&_ylv|qH_te(0ylM-R6)u@U0A}mI>txeDler`7y3&rn{-j5={l;zW z!1%6;E$hM@Acy^pGL+Czz$#LdwWFH94I7SVHpK>a0LQtDgfZjvb)Gl_8dEtsD^Pcd zjWIFF7*ARxrl=wqL$zA^glB~xZ2=Klp{Y6N^76;lR^-|`ZeZ(C?bvX!s+h_<_BO}$ zrE-z4F$-2omBTkgbwp`P&WOg<&bJWNpLl?DtH}S7o-PB#J(CwNTQAP-=1yDVd(vqS zhUsPqA*N=ByLyh^s_V=AhaP9XDM@Z!iDb7^Dz`cX;n}7f(=KF3tDq0YB6&dqBXHH+ zChP#Mm)CISz8+ei!R2S?2ZkeEd$u6+eVFMM`%k>(f!EUV=n5tG8_+EU6~rNoeTE@o z)HL(X=O9&HoRb?ZqttIqobFRD1rR{XI^|>h3@mq50cWhqv!+tP>~m5^XF=zq#`G>X 
z;tw$}ylDRp(WRyZ<2yDCxhjENjmM8?j+)=Pn{=q=PLZk`)t^0&jR1nNzL22)J#D+p zP1x4O28qVXjXUttTa6=SMrcK@JT~{w-BXa;pQSam1>rJ%%{ey75y^86aspt~LTi8p zQ%Pt+g&v6iH~@1;WQbBkl$CzKe>vUXuK-A|`eqO!Ir@!CEjC2eb;9$-$j1{I3Qk8_ z2Vu-%$Wg(0H6UevyzcT%*4)5sK(>@DTY3C&6&4T1>Tn!`?R<$nY$YKb>B03{SX};o zf&n3Wjg^Hdmm9b8l!KW` zJK7g4|nOK8r#6i4El4`^SctZFoDtn4zfgK&Bm#L>Cl>d;X1o3 zvz?u3QqngsNut6a7<(7?xa__Cu(lj(`^BKbhalJ(jk$9y%KxnzR3Ik_$P>b(O4Tl7 zI#y#A4bBEYGVXJ70?3dcfSl^MehmiZA0E>zhbbubX0;Jt?FQmg_w{oAm4B7N-0VhG zH%??|7SSgIOflJas77IQG2OW6c?0DLmgJHXLkYhI-vCJPPqPtun0yM;#wET^VPy)4 zLwlclJK7!jYk0qS|6Htm(PzMdiPvcr-3mp3@(-YdfECG|->8qsR@C1pWG%VJmCjW? zS-2|+N~F+x3lzhp0OpllfwK$uwo#Lh`A@NH;Z5ruP}QD){Z9;c0HBLMT)f9e{CJun zyET<+0GJBJ`jLPwPUy0TXZ;1$>O?9T;LHpl@cgnR}o zLf1RL%Y;IwMnO+dNESQ@+@iD|PwL^?caC)i2T6N1XFJSMdfHCY3?Tg0uI({9T!37G zR^cR>?4}lmjKtAmj?6n4(fexKf`A1JCP|z}0R@}GFsB@SRe3}*lLq=*C8IU_d|Wz5d% z(q zA3Jsh3kyTqG6snfOBNwL^Ye9O0^dnY=CklT$p2eINxlW_hnVeNER4;Jp9+2DbEkQ8gf*4PE8$;7D@}ZT0o;TH5l9jgl~B{&MxV zfo+)vRD(9!_T@$w-S(#QV1Z9?I|%oxr9GX=D+bJlisZjNweyq0Ma14)J zPwjL*%<0T)w^z=t-ky%ZFi=Y&F!~bEQBaA7hf+w)iA&#->7dY~%Xhm5U?gEgK-qkD27&p;JO(2)+2J@T?IL&aqUbU2@xd`a|1x%~sIsm-`EbLT<#4PFWIY|7vWhki z+D2QPn&_Io91OrVP4MG;$c*P&Pg6sdL7HGwI5(W{8p_Gx;U5GqzD8RYtnOF;X3ir< z1j}*@I>+bzkzGMoy?L`gO4;;evWBkBLXUovqQjtTd%7nTg>Udstn($T7WcK%a^Z4g z991nGGqLD{<(B1#_t_%>B4l^W;8s0;*YF4CdX^@rrcICxeo$HS`j)Pw-?oTV(!RoS3Lk94HF8s}fZ`?wD&MdyGTgXK4WFGg?`9hY2R8(zr1gX512VFP3+)@8+ zBY&Hb*`e6+@xE=M7~ae5RBlnpgZRs{tZoSYIe7*aSX&B~-z-fp`|Yj4NvyIl5e z+drI{aBb)=c~1f_q>08|9{ysxmLyrhZgT zTJ<~Vq$J5{Rr{Z?C62dM*wMXuU9LRn4?Ya6{b)?C{b$p>+}54Bi`Fj1n)usL|Gw^G z>#6X-E$1lE?t9mF;A@M+RjxC~FSc^8|NI@R(3)ZoJQ!8ZdKi*%{K#Af{`l@jQ}&p| z1qpsjBjc^Ls;`R!upI0*ef4(43o|n!R~~U73m^{Weoo`PiwM}W8MO1XRXx_$zhyI- zifkrE3O}}Vr*HoGkhNq`v}UBARFZ!X`-B?yhdp^+s82Xv@4mC0yImv5@R|UYD-(TV zrx$KgMf!mNF?UqYIezy>mLS19>10*6+Xd6$NJ)pIs@Fo2MRxK;yw3rK=dUwx&N+vW zmI*E*_gCEQlG74lCQHqu22f^#L#y#Y(+VNN*O&YLmzs;NP8WJE9J~iC4=2|1KjRDP z=Vz35yA$BoK_$CFF2wetTYl=HigVEoZk?1-soWZz~x!Xs?Ia^VWU^5^I0!&GS1qF 
zt07?DU~x|QhH}n@K0jhMh#!gHB`Q@YO4?M)Mzs4&cpTJIT4RA{Hy@OdNHX%=nC=!} z&S8z6!;1R{#ma&I>sc-S-`7L@y?NWCuT9(Ca3H?VBmyPfdqkreqvaV2CyplGoXd<3Ig@R&P+90_43jQS2lx(PKc4hwjpt7CMwW;UMaCmF$r3H=G<-)dFE7-wG{ZeTu`kJdoY*(sYg`wX1(?~|2BMfjwqTMj= zTGesTE{dbSo#72dV9hG>tgXiN^-${(*|p6x!Ggbke%pr@U2?a0Jh(33FuRXWdbe-N zYmUk;#^JOp)#|M?Jkg>iY092C3?8zDw+6t4T?Q z)HSBT$LH=s>k@oS^0Uxi_|W}VkATslO!vf#i&r!;uOhX~)^s_YM~RA(4k}JiZM}ha zAKbIst0?$+yQ=wD2ankj!H&ns8#V3N?ZmIsARR3;*#_UF%d0YwMh zP*zP3v7OPrVocZb{>kR9htp=>ispD_|qNG}?O7MCpo*4b8?#XbX%D#fo4t zC9M!?GHcm;gA0k!^_2ryPY`owy`4V%-N1MZR>rNy-miaZ^&`?|4_A_N{4I`3JHqC+QCz}bkHsm*U-KUsvR}rRB z*KxQJ!iHF-IgdAgN}p~MN{4R7yge(->rW%V6M_kFI+@KKdV?NP*M&#eGwu*lFev`5 zeN2E01)SP!j!!gXoLa391DpK+T6^oTsG_e?nC@;#rKGz-kS=L~p}V`g1VKPRkQ7O! zq`P5&fssbR0VHMs$)S<1@A%&Py}$SS>-*#0dCoKE>=XN}y?U*^&)yq_#vHIe^)_L< zfqY!u? zj`1+O8#1isxF|1?c0kfKY(pKFao*=)R{FElbg#M>81@8?BuvhMgo?ITHjjipnw-SJ z8dvS+E`RBK3b&6_(A(nt!d}6izVPh?rh5Q%ky~WU8JXdqoTXx-DgFLE@zuzUF1^iY zu+}5a%#3`gv82RLI@iXU^E*^5VXb<05{e`b;$cp(r@qsi_S~?i2%-O$i8F89&KeO!Dr$6s8rCq`Rvy; zVXJzS4p#HBfTgq1fihYi1xJh-byJ#9Itp?3)l6DRds<%&Tg%EF<-&Ck$2M;gh{+|w zA?8fd@rC8*b!81C(9_KYX6+S!#1DI$@T`_8?W1Dwu@aE0A zJNL@SMbm*h*J=x5L_{6xBzrs|{JHOZ8T)nXpRG^owZBd2xs9#sMaj)!O-&})l(KRB|IuP7{5&Wj#5al#C9rznw*B?7 zcusxsH|ss5pT6=N??37bx55$+HInJy0Y|&x;-KM_t8#$|)(nqM2W5QzFMY%)H3=}! 
zXBvrt(T9(p=_N)JAIJ&f%KP7&5OQx8`f@dKSAPE{FuzF2Gj6J)9EfRoyU(555G;4J zGeeRNg9VS*iZL@VC`m~t`ul%c0*<=6D|(WN1t?OQ?ym$tNcc`=)A)PMnli?IXmz17b^1To!F-~A2r=3r^sL2xjV7w z9J;YkFnF9tMk*7{*YIy9i7J0K-YR&6t6QU_zVKjgN|k*fbu680=g7Nc8>m+wA^h4M z9odkR zjz07R@Thb&ri0g5FjDE+X{>Za%V9V}2V!28^_9a$7Ad_=cwvTYZQrK5|bKgQO1B9GHO?oNWK ztwQfe5=^nLPRSFlPO2+94m{jC1BYaY1-2(*4x%uZJYzfzDMTh7>TYbJP_Jq5)D-2mLiucLa^ru-zpby+^#O=}H6v=9} z)Du4*u3QBjF$U3Z25x}*!p-%^ExXae3^(5(;e}P&7e(|rJ!mt|+JB4G3=*m0KPeDD zUTm8!1T-at70OH;8uZ@ztFp*N)h4=$XZp@waPd{~g4IUAJqC?Z*7x0$4joJTw!u4s zl~Igcse_uTYi0GZlj!4c`>9B?u(x^3V;@c?sL+yaO2*8Tvx35nS!+dm2Ue97koNs~ zb5fGt8CxZe%#AoHu_+=%V%j0*^(&eHS2`35X}l=Q?9${s$z*LyURDOBlm%(E+B?{o zwBkX(?RHP>IcGZS5U-`j!E!N-TdGJWFtR}?g>K=g)cIBrS5W8u`jSCL4{c-sHpAsg zPA+K7$WnXs!0IWI*GqkuCP9Of?;9_o^f%gJm+XWA_V3{R6e`QQqw>1WqOPsTzKc0h zjUJ*pkj2Dib@D==OjIO0g-_^YV}n@Vbk7st(p++*Lxpp`=hZ z0=;BJ#a5On=KdCUA1=2WtYSoU@r}#f&~Sm9yV20lw!UVtt%lxA9e1oSj?vhTr?;}_ zWt_AW*(ewMa0MfcK5tHR#?<_M7?R4|tbKj8i4yuP>#yAD_Z=Egy!@g?5MPJMgyWu$ z5!LCE8UM!G%4GP#E{n*;)fGhNvZ0neZ3!O9Nd0=s5AIM_d_(5fd0`oR|9lI4)cf29 z8Z=q9rPXlm2)flitGjuL{}%Wp4WYnMSqMM1|L2&_|FVPzw9cc>0JB%9IUWFdb;+A9 zOlpE143F6824#r~s`V(GG^Q|Zl(&lOA-F&pvBIehS=su5^93ItpuNUAhx$}!x3~oW zF8Wppc+Lqi=xA{Yq~2a!Z!cmthS(494kF>^0?ocu;NL=RlWhfr|$S(1P-*jpi z%GFMMntr}P{@mr+!K&7Le?-n&jwRXzohzm|{M7(2(~x)BC2j6Tlg|cB!*khGeO~uN z1yRe%-5SwDa8Le4%+vmbUl9I4QB+;C9UC((*`=KA<}#_YJ+#YZGB2ZqMbjXMyIXTB z7LgttLVj=}wM_>(t(9PoQcQL6okBA*bQH_4z~Gs`u6 z0sPr~akIJV2C}uHYvt_1CarGNeJkBKMW4dkr!6f~+>lhuW^L=MZIoup!s{)WyZM%J%y&x?XT)~q zI4t-e!TYZ|&65wy1{e<1rN|=1T^iFTl|hk1HDLSb*CKi_(^;8?iPSa*z-1Wa=B=Nh z-p@a66&q5?tC%ov8soUkG}z-Tlc|lkCKWovM6SY@_{Cz~~B2na)etiE@U*sqtA%(v8`=@)|itZStThd|&&vQwG@**uBtb^O) zhs&*n=acm*OqUoFCWOf_GVVoijAv|sS0^d5Nef|m7du{q-vYq%!JeI@Uo=(gF;2#0+%->Ylb;y9hxsg=O4+_?*UsG8{gHh5x>v0x=N?b zg}O*wYp?f{i#GENrgb-J*Ght9<7TBE?eO+M?@OpHi*IiqV;;}vCkw>rLc~|q)}G6z zwhBl#>rc-Iu#u!TohcDdGwmX>Zah<4h_&4fUbfAS-T3m;jrlEr(uaVfUzm*U*&!WO z-$ebH63T;3VVK%DBu=dB;T zymxJNS8Jzlqp)UK!xkfeT#+a11scKj{wRpmpi_cwD7p!MVzVhq 
zpu)(jTDbe0_<=x9NuV=4?MGix3X0ry-gWaT9)~$DX7$7qB7CDyZeGJZlSXWgP!GRm zE-%XJ)ABRCa+VQL}H$E{6gHSVr2V2mQRc;M^0v-q2ElL%o>#8~M3-sbQTDe}=& zGL`=HbBJTc_>-vp?<`A&FS-_*Mw^^?M-SiIUNFjD2uyzt#j0-dJ=(mS3OnUmUX_2W zkJ5q<5kK!!C)_dy~oUF=FP7|cvYd?}v^au&>9VQAHpQOn!wriUks}-4-%tG*e$ToAb z7UX#yR$XWMQ^$_~}=HU%#L;5RrSjJd@BD&nwAPQ7D^(qdnqeqIsdTyVeF3uALuYA7Z&i z+kv-RU^-umD}e~s``0|Vi`$T?{+mZ|%n!bpunQ8p+z+fRi{2i~xoV(JF^@r^@zP7Zom&dY&QZw3s(I%_^xbBWUyr`YjO9Ujp|8%%F z_oMuI4K|Y{&Us|Ho9+kWU%m>CiLa2Ai_BqxzOrY_2IPcJ%R6LzAdh=sgC0PbV5a=5 zv{ZBlj#w!COd?*$`H5>uK!<1}CkqMT%suj$eai6$V;&ad={WgqD@>`o zYv=J`!$17I+J$HB4#ea06}bdQEfZMBRUG0Wuz+6^t+w}xE_(GlcODq6_P1tSdewfN zre4pBJ&^HhC@|jZeuATPwGty18D#B`az(gl$yYhy;Nc?1o!ksE$<#`f9u4BJ4Xj*n zL^AS~>m6o>Afv1F|=7BK~6E{&2<`iY180 zcA@>em=o0gODp>YKH$n{^SXs+aQE3SHhC{CchnyKIA8F{r^^>iIMUUG*e0^ghSVO2 z_TBB}pkB?D>ooRR@Ad!4OZs~u^!IlAN%)NmkR0L^{Lr6XptV{Q5#2)^MPuf3($izy z^{2zbyt6Kjdp^%j_w5(K)K=og{<0$UirYoF5V!<}8eYOdSTH2wPc$@eGQ_`7i$&SV zm%Z6p`Q5LPXI|MPHM!JX)pj&!sY1tBviAgawVq~$8dqVP)ETbRvLVhMafE_GW38?v z|61Sx`#~))g67vO%8%ka#GaJU`AJP(u#j8c6FC{>EvagWzRc)~g;)NglOHk4C_<)W z03G7aRVd@u%fB!D(PW7CGozH&_=iqUCeMl?xO`51^1D>ly8BtSPL&0M{Oicn9?0#M z^x0_R<_v|qLc!}lUA`y3dpGKk+R1);=FM69ptG|{lR5EM5n4aj1kzK91{oOq8Rdw? 
zS0y$s)lJZVmBCOEvWN{Z5#lJOB2(Y31;${|^PrFDk6wy~qlu%Yag~X=)jm&S7fTC$ zKyJ6d1E{l1{Z7N+*wvKG8~N$n?R1xKssDLK^MF~Jl9Ll0c`p$}$L)Jn;Rf@bjoQ!s zyL?2fbJMKWV3rcB?-p)bEwQAYJ}<8x4u2AxFpQNiOTZr)(4zSaB9Ah5RUWX`1*Z~rP&nE<(v8Qy zyC}(-h-hYGQqO9-v`OJUG=Xw5L}Q4ziG|;%(XSpIybkCK++__^lxq=7t^VN$ZnINd ztu#F45@b?7w!UuSwYZ}87f%m`(~)g}FsD1e@0z{u@vlj?s0a=DQk51NmHQ$kV&8=x z&eqAXB=1oItpP!}X*N3@F~2MzA$IK(N7V9V6Ne0)h7hBktO7;42~U*tXFD=@E6`$O z?&908$Xv+|FA-_DFIgYb@!oOI_qYi8L=XwHC0#oh8UZ2i>)%td zNK+9i2LTZ^uZEmHYo^{9%bVEyt&Nvfwoz8aX#+FGopIRY2GCn(*>F-IklgKODsant zKGK%%k@M^A>>j-Z&&TN^0@fQOerA&xh|#q>T^z@gRKP~F@Ke$%K9e#V{F)x(eerEd;t0oJBvHqE~DnuiE;t2m6A( zC1oRGcrH%Vd83HC9zrN+Qxq)z`hX%*uBDP!UNy17+D>j$uIz?{y0fjq9>|e)MO!-l z1oyR3@f4}|0Yl&DK(*OE$B^pJ?RU0fH9b2ua>9WPtGkEsw}XA0W?m`oL03{EZ1pP6 zBKp>qOt@ zGc-yb*!m_TX1s~&xVdiMo+ke3rqbKbUFp5P>D+o_3 zc*BE@$tWG-DQ%+7y-38(m=JK+Hk1Ul8E|PvG|)=aQ0*IS>~HvVo)+&h`#s^N6*O}= z7fmsGc|)t(j%_*-Om>m?G3{@m%%xT4h&ZO$tw=T1 zcaa?}t;Vw`+GXn}HPJU4{>nE>VM1f4Px%33Dz2J`?pn%K+)mu6-GZI(c2C~E8}aWU z^1Bd>84=1{q+KLy_I~`4^WFEi_Y!X~z2)*nZSX$zMxcqey_xqMO$QI@>0rSh)!!%a z(%}#WuwLcEtX+gnx6z_aj!egv^X`o~Y8BY}h4DI*edf3-=Ws2QQyZ z{kXDSP(g~)_B^QgQCWBUy^4T3jubSeQNJXhUD6L?Ku%qmMnQ_y9M7>P99Rc_9UN^r zZ|KQfGsd$FZPab+F|d&(|C99RGkznNxwzdnP#YiX_FT7wUxJ5?E4EM4A2Tl*07tA0kYY()KCu_I~W_CFuSfY}F;rOn1o)ZPAs%$LdYwAXhm zw~4MP%A9?^L-8(9Fkajje{D6_lodwYrz=C-kqHf%p04i$wMR6ZzeKOQtCqg8)x%@y zYopn+mBp&Iv8F-1euh`NLb2Dy60J!a5p7@eEgA{FUi2`bcg> zBJ|`-Rr*2t{KsIK(@@oUd9kZNR;R>D5 z%!(fuIO@VoBzdawfdiEpBawvl(*l#hE`#FO3?4onx}V1YICs7AZTfs(nw_)u3@{_y z;CPWVL9+Ytia#X%3h6|tiB%Z+X*TN8(#P0gjzUMyVDAl z#+fp$VN8vF_Py#z?m?oUf>vkk^Wh@F>6H5y&O5sY&*b>wub3p zk!_-cbsbdB$eG8v?A`Eu=pa%QOVHtE_>05NtBdae5_PCvhNN|at&Qj;vZop^hjGYQidE`%?b^R29-!zkyVMQ6 zqBvKT{F|uQ@DkgHAA<)Bxn5d|X`Q`$wJTCz)YPA%KZYtXvOk8kv%C9@o~X!Ao@LcN zzD3N5f!2nGz{!bxzfiF)G#w{l=nCk{?Cv(8<*EC2-mq5BU)(vbqg&^)x zERsS_yFSIZ%8Z%Ty~;|e-$)}wFVepv9BvRgrb~(zqA}X?JD`f)NFB8ZZz|P$NoqiX z$*e(IP~Jp_Idz{TW|G4jctOqkNt>0efd0fDX=*$)W}>ZkjA6y#Sf9j$Lhr^ 
z>cz(b2c+R^90T6&=9+7|;dy(ezx}ge1KU$XnUI{_Cy-;rP<43l6Lw8ZA2Ns{Nrlin zA6m!ylJ=Y6*Cf>P^l=SU9D#!p@2pwgD=IfY+YgkyNk)VYlk2f;U-i(Ym|IXdrOIhe z&5g2i1Hg#@3VAzys8Uh#sfsM9DL1_{0-C5@VYXef>AJrD)JG!v&){NMk4^ zFmNDKAM(y>no8}xh7)}ru~G2^u7fr{MW>(}|7`NGNS`-F6vtRJ|92+K$gFc}cXkSN zpsdT0Bu6p8l$0mPNyb7UIKQU$-daROpfpj*7GooAh-E3AW6k+}Ol9(FWwLY22Q_#0 zQG%2d9LBp2!|&$gklHsLxVeo=d4dpa9BdnUt~eG(x&1hwZ{9Lf5Ne0ITBQb(VGV+6 zA*9sSM!J+PLG`D-Za3DxS8dLAx*1;BO3_b~iuL2>zSH_G(R6#=UF^u>aK++~7$ymj z7>(7mTf?jd@t8T7@%$3K60(@39HrkdQ{-Cc`*WVhbQ5;itxs)^=S)xkn4g|Tykl+_ zeZTl1JKHK<^i7I&YyPQSe{!Tm1g3gLcm^XEb=eL5)7o(ih3r z-0Ec&3b(>}#-6Xoqi%>+jo-WmhM2u$1N-|CwND9L_It>*mFmr$nle3VBHNkrl-ZOtx&2KDf13navT(Nj4cw%>K-~VYb3DXp zD`a}7D>nlp8z%KN^EaV6t_)srrqFs?zvyIIFIm(BU~B7AkBatQuwRyZ8nP%6L}w=A zYkRo&d$!HHUBD@HzDzwN(qo{mOrO}F2FVC{`tLOEpC0O>VH(XpM?B3_bV8TQ09+S6 zPOMoLI7jG=uQd+P0&TTTkftVsCe{5>SoZ(A$zn>nd19|8 z3nrYWU;wN{MgqJE_$dVAF|z;ntw0$x!x|%s3FP9q z%Cx!4&@`$TG^!4E`3#yk%RCJT7YH_h;#wat1_ojc2%P zR5@r=6>*exW$8zj&B>O{!TfW$^D_0R%H~!ys>o0L>Wnoa^f=3YTC`n%&FnJgJkQnt z#`bV|cZfU9T{dT-Q3bm?S~Dq|`{th&R5jKDV)lV#s{3HDW~=c4;}+b>G+DF1cJqv~ z``GV}kmf#un1KPXnfgp9hePzFQNqVq$C>)yBx*4=`^3@VSbSJ@Snl9i4Ne_X%`63A zL0>Q&C196N6KKk5v>ciZ!UG%vc8Xc`&*sLI#STZiqG;fIxGIV^n>DFqJ+`*&ehB=u z{fK)d4Wvwz!2(+WQ+c1(c_W_CYO;B{A|ANZ?e?T<~CC%LiF*lQF7ia!P?I~}4 zczxH1DuN21Gll2F2FpI^f5X{9S*>Y(Q0yB22;hsyn)p~95olcxyE&RQ`}RjqV+H>% z5q6^mz{$l~7Kz{DRx2mw2p@{Ja=YK&yJieNcA0e_X%yjSfVx zm?DcRY8y1)LC6wWcupkH{(+^&qb`=nMy3f`Z&X@W+k%kh7J-;~8%A_B7Grq9%`CSq zu*Rzm&7X#KBBP1*VLwfi=B~w#J7719&3(Qu!PE^v&PcZ>U;GyC>j(F7sT6^pze`GxAmcB%fnlnAp2@1&oILaw^vZ=kSn}_pt z@}m2>EP^|C`=MT=>I3X%pe389^ANKKY*Y4{;_UGrA82%WJBZVe)=DR!fLnPY;e8q# znW|7cfCQk~8LGC4kc?MPeOUcAa3+|u)zCCwZ7zYaCU}lB$}_)V^r4#9wi12qmf^HV z>284)rO8EFaJHg0AD!qEQHi55p?RVap-`h;a+jHA2Y;Yo{sj;o>tDrjYV1IxiXfzk zEW*qdI8Y>7>C4f$d$yNmKzl+ps=QE-P*zJ@J+*^E^sVuA3HJ3glr!~7Ck9v^`5qQB zR@d3IxV^mx`xIXq(vD?lSfL;OGJQ06>(z0)w?iPo9xi;wTq)+`LKjjwXipga@!P9^ z{6?lJ^j9i}`5$HN6?oDmGCF-Q97#0+iH(OzVeDf83xra8b%nAWYjlle;hpSq_ha^8 
zUm4V%C;I1bDi@-Q**f;BW?O)DV(?pGKSZ*5^B!u9`!AL^{|ADdIv|Nc_5WDwnJ|FebiF((Rcoq7r7jZfK&Inj zeOKxR@~g0kDyFDRm_8f4T`*t3l9+(2Oy%bDg(KP7iEKpeFQH8K~bKiPamAjS>dH?7O;BH8(2D$Z4Q zi0x^t!7xNXA7nR$a_YRN*=_G&3!sE7=h^>PU+|m*TErd~N~)&7^TTvp$OcvRZQePd zZEAFsUe|c$8$Z}A50iS(;?LvmVAKw|l})vQkUsPUYz%??y$8epx?^gDp5QM90(&op zAN9gN$D?riEXn1ei0PeNeRQ|LQ80#p(*vmB(3iiJZT25ZeEO5$zhgP^%K|YsD%q<0 z%3=Q_-A~JG)}jhXIvcI(`nFVkqtbAc+W*kfn4GgWAiV!p)9;j3{(rPI+yATJ0ka~a zZjJk&M=`bj&mLus5qeRb8S0v}EnAhjXP{yD^^5yI;{`P1e;Y5525StP5}9+YsJF0p zy8fP*$^~~>Hu@vM@;89#jjsXFc)|fOAyJsff{-<4a-1ER;G`1(S_GpH^E2^hj6(FA z7MS?gX7hjb691=Oynpp(aF#J*`bMip^P?eKVFkOqS%4KAR{Cc%-*skyn5m?J=BC?g zRM{d>J`nd9yWKl~N@fL2S$aB%8Q9gaX1^QO9s!dvj{+%fa6fti>px8A68`5s`~TkK zskjB>#h8m|CX)H;M4yP}s`f ZK59Gu%GLS1D1rj~)RncB>J+Ts{6FKc$3Fl7 literal 0 HcmV?d00001 diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 19205385f311b..fc5f254035a53 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -556,13 +556,13 @@ orthogonal matching pursuit can approximate the optimum solution vector with a fixed number of non-zero elements: .. math:: - \underset{\gamma}{\operatorname{arg\,min\,}} ||y - X\gamma||_2^2 \text{ subject to } ||\gamma||_0 \leq n_{\text{nonzero\_coefs}} + \underset{w}{\operatorname{arg\,min\,}} ||y - Xw||_2^2 \text{ subject to } ||w||_0 \leq n_{\text{nonzero\_coefs}} Alternatively, orthogonal matching pursuit can target a specific error instead of a specific number of non-zero coefficients. This can be expressed as: .. math:: - \underset{\gamma}{\operatorname{arg\,min\,}} ||\gamma||_0 \text{ subject to } ||y-X\gamma||_2^2 \leq \text{tol} + \underset{w}{\operatorname{arg\,min\,}} ||w||_0 \text{ subject to } ||y-Xw||_2^2 \leq \text{tol} OMP is based on a greedy algorithm that includes at each step the atom most @@ -906,7 +906,7 @@ with 'log' loss, which might be even faster but requires more tuning. 
It is possible to obtain the p-values and confidence intervals for coefficients in cases of regression without penalization. The `statsmodels package ` natively supports this. - Within sklearn, one could use bootstrapping instead as well. + Within sklearn, one could use bootstrapping instead as well. :class:`LogisticRegressionCV` implements Logistic Regression with built-in @@ -928,6 +928,149 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized Linear Regression +============================= + +Generalized Linear Models (GLM) extend linear models in two ways +[10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, X) = h(Xw). + +Secondly, the squared loss function is replaced by the unit deviance +:math:`d` of a distribution in the exponential family (or more precisely, a +reproductive exponential dispersion model (EDM) [11]_). + +The minimization problem becomes: + +.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2, + +where :math:`\alpha` is the L2 regularization penalty. When sample weights are +provided, the average becomes a weighted average. 
+ +The following table lists some specific EDMs and their unit deviance (all of +these are instances of the Tweedie family): + +================= =============================== ============================================ +Distribution Target Domain Unit Deviance :math:`d(y, \hat{y})` +================= =============================== ============================================ +Normal :math:`y \in (-\infty, \infty)` :math:`(y-\hat{y})^2` +Poisson :math:`y \in [0, \infty)` :math:`2(y\log\frac{y}{\hat{y}}-y+\hat{y})` +Gamma :math:`y \in (0, \infty)` :math:`2(\log\frac{\hat{y}}{y}+\frac{y}{\hat{y}}-1)` +Inverse Gaussian :math:`y \in (0, \infty)` :math:`\frac{(y-\hat{y})^2}{y\hat{y}^2}` +================= =============================== ============================================ + +The Probability Density Functions (PDF) of these distributions are illustrated +in the following figure, + +.. figure:: ./glm_data/poisson_gamma_tweedie_distributions.png + :align: center + :scale: 100% + + PDF of a random variable Y following Poisson, Tweedie (power=1.5) and Gamma + distributions with different mean values (:math:`\mu`). Observe the point + mass at :math:`Y=0` for the Poisson distribution and the Tweedie (power=1.5) + distribution, but not for the Gamma distribution which has a strictly + positive target domain. + +The choice of the distribution depends on the problem at hand: + +* If the target values :math:`y` are counts (non-negative integer valued) or + relative frequencies (non-negative), you might use a Poisson deviance + with log-link. +* If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. +* If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). 
+
+
+Examples of use cases include:
+
+* Agriculture / weather modeling: number of rain events per year (Poisson),
+  amount of rainfall per event (Gamma), total rainfall per year (Tweedie /
+  Compound Poisson Gamma).
+* Risk modeling / insurance policy pricing: number of claim events /
+  policyholder per year (Poisson), cost per event (Gamma), total cost per
+  policyholder per year (Tweedie / Compound Poisson Gamma).
+* Predictive maintenance: number of production interruption events per year:
+  Poisson, duration of interruption: Gamma, total interruption time per year
+  (Tweedie / Compound Poisson Gamma).
+
+
+.. topic:: References:
+
+    .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models,
+       Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5.
+
+    .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models
+       and analysis of deviance. Monografias de matemática, no. 51. See also
+       `Exponential dispersion model.
+       <https://en.wikipedia.org/wiki/Exponential_dispersion_model>`_
+
+Usage
+-----
+
+:class:`TweedieRegressor` implements a generalized linear model for the
+Tweedie distribution, that allows modeling any of the above-mentioned
+distributions using the appropriate ``power`` parameter. In particular:
+
+- ``power = 0``: Normal distribution. Specific estimators such as
+  :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in
+  this case.
+- ``power = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed
+  for convenience. However, it is strictly equivalent to
+  `TweedieRegressor(power=1, link='log')`.
+- ``power = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for
+  convenience. However, it is strictly equivalent to
+  `TweedieRegressor(power=2, link='log')`.
+- ``power = 3``: Inverse Gaussian distribution.
+
+The link function is determined by the `link` parameter.
+ +Usage example:: + + >>> from sklearn.linear_model import TweedieRegressor + >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + TweedieRegressor(alpha=0.5, link='log', power=1) + >>> reg.coef_ + array([0.2463..., 0.4337...]) + >>> reg.intercept_ + -0.7638... + + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py` + +Practical considerations +------------------------ + +The feature matrix `X` should be standardized before fitting. This ensures +that the penalty treats features equally. + +Since the linear predictor :math:`Xw` can be negative and Poisson, +Gamma and Inverse Gaussian distributions don't support negative values, it +is necessary to apply an inverse link function that guarantees the +non-negativeness. For example with `link='log'`, the inverse link function +becomes :math:`h(Xw)=\exp(Xw)`. + +If you want to model a relative frequency, i.e. counts per exposure (time, +volume, ...) you can do so by using a Poisson distribution and passing +:math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values +together with :math:`\mathrm{exposure}` as sample weights. For a concrete +example see e.g. +:ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`. + +When performing cross-validation for the `power` parameter of +`TweedieRegressor`, it is advisable to specify an explicit `scoring` function, +because the default scorer :meth:`TweedieRegressor.score` is a function of +`power` itself. Stochastic Gradient Descent - SGD ================================= diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index f2401ee425bdb..eeaf89c174c2d 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -221,6 +221,13 @@ Changelog :mod:`sklearn.linear_model` ........................... 
+- |MajorFeature| Added generalized linear models (GLM) with non-normal error
+  distributions, including :class:`linear_model.PoissonRegressor`,
+  :class:`linear_model.GammaRegressor` and :class:`linear_model.TweedieRegressor`
+  which use Poisson, Gamma and Tweedie distributions respectively.
+  :pr:`14300` by :user:`Christian Lorentzen <lorentzenchr>`, `Roman Yurchak`_,
+  and `Olivier Grisel`_.
+
 - |Feature| Support of `sample_weight` in :class:`linear_model.ElasticNet` and
   :class:`linear_model:Lasso` for dense feature matrix `X`.
   :pr:`15436` by :user:`Christian Lorentzen <lorentzenchr>`.
diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
new file mode 100644
index 0000000000000..ee863dd4198ba
--- /dev/null
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -0,0 +1,455 @@
+"""
+======================================
+Poisson regression and non-normal loss
+======================================
+
+This example illustrates the use of log-linear Poisson regression
+on the `French Motor Third-Party Liability Claims dataset
+<https://www.openml.org/d/41214>`_ from [1]_ and compares
+it with models learned with least squared error. In this dataset, each sample
+corresponds to an insurance policy, i.e. a contract within an insurance
+company and an individual (policyholder). Available features include driver
+age, vehicle age, vehicle power, etc.
+
+A few definitions: a *claim* is the request made by a policyholder to the
+insurer to compensate for a loss covered by the insurance. The *exposure* is
+the duration of the insurance coverage of a given policy, in years.
+
+Our goal is to predict the expected number of insurance claims (or frequency)
+following car accidents for a policyholder given the historical data over a
+population of policyholders.
+
+.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor
+       Third-Party Liability Claims (November 8, 2018).
+ `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +import warnings + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.datasets import fetch_openml +from sklearn.dummy import DummyRegressor +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import Ridge, PoissonRegressor +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import OrdinalEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.ensemble import RandomForestRegressor +from sklearn.utils import gen_even_slices +from sklearn.metrics import auc + +from sklearn.metrics import mean_squared_error, mean_absolute_error +from sklearn.metrics import mean_poisson_deviance + + +def load_mtpl2(n_samples=100000): + """Fetch the French Motor Third-Party Liability Claims dataset. + + Parameters + ---------- + n_samples: int or None, default=100000 + Number of samples to select (for faster run time). If None, the full + dataset with 678013 samples is returned. + """ + + # freMTPL2freq dataset from https://www.openml.org/d/41214 + df = fetch_openml(data_id=41214, as_frame=True)['data'] + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + if n_samples is not None: + return df.iloc[:n_samples] + return df + + +############################################################################## +# Let's load the motor claim dataset. We ignore the severity data for this +# study for the sake of simplicitly. +# +# We also subsample the data for the sake of computational cost and running +# time. Using the full dataset would lead to similar conclusions. 
+ +df = load_mtpl2(n_samples=300000) + +# Correct for unreasonable observations (that might be data error) +df["Exposure"] = df["Exposure"].clip(upper=1) + +############################################################################## +# The remaining columns can be used to predict the frequency of claim events. +# Those columns are very heterogeneous with a mix of categorical and numeric +# variables with different scales, possibly very unevenly distributed. +# +# In order to fit linear models with those predictors it is therefore +# necessary to perform standard feature transformations as follows: + +log_scale_transformer = make_pipeline( + FunctionTransformer(np.log, validate=False), + StandardScaler() +) + +linear_model_preprocessor = ColumnTransformer( + [ + ("passthrough_numeric", "passthrough", + ["BonusMalus"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), + ["VehAge", "DrivAge"]), + ("log_scaled_numeric", log_scale_transformer, + ["Density"]), + ("onehot_categorical", OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ], + remainder="drop", +) + +############################################################################## +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval +# (``Exposure``, in units of years). Here we model the frequency +# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, +# and use ``Exposure`` as ``sample_weight``. 
+ +df["Frequency"] = df["ClaimNb"] / df["Exposure"] + +print( + pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() +) + +print("Average Frequency = {}" + .format(np.average(df["Frequency"], weights=df["Exposure"]))) + +print("Percentage of zero claims = {0:%}" + .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / + df["Exposure"].sum())) + +############################################################################## +# It is worth noting that 92 % of policyholders have zero claims, and if we +# were to convert this problem into a binary classification task, it would be +# significantly imbalanced. +# +# To evaluate the pertinence of the used metrics, we will consider as a +# baseline a "dummy" estimator that constantly predicts the mean frequency of +# the training sample. + +df_train, df_test = train_test_split(df, random_state=0) + +dummy = make_pipeline( + linear_model_preprocessor, + DummyRegressor(strategy='mean') +) +dummy.fit(df_train, df_train["Frequency"], + dummyregressor__sample_weight=df_train["Exposure"]) + + +def score_estimator(estimator, df_test): + """Score an estimator on the test set.""" + + y_pred = estimator.predict(df_test) + + print("MSE: %.3f" % + mean_squared_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) + print("MAE: %.3f" % + mean_absolute_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) + + # ignore non-positive predictions, as they are invalid for + # the Poisson deviance + mask = y_pred > 0 + if (~mask).any(): + warnings.warn("Estimator yields non-positive predictions for {} " + "samples out of {}. 
These will be ignored while "
+                      "computing the Poisson deviance"
+                      .format((~mask).sum(), mask.shape[0]))
+
+    print("mean Poisson deviance: %.3f" %
+          mean_poisson_deviance(df_test["Frequency"][mask],
+                                y_pred[mask],
+                                df_test["Exposure"][mask]))
+
+
+print("Constant mean frequency evaluation:")
+score_estimator(dummy, df_test)
+
+##############################################################################
+# We start by modeling the target variable with the least squares linear
+# regression model,
+
+ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0))
+ridge.fit(df_train, df_train["Frequency"],
+          ridge__sample_weight=df_train["Exposure"])
+
+##############################################################################
+# The Poisson deviance cannot be computed on non-positive values predicted by
+# the model. For models that do return a few non-positive predictions
+# (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples,
+# meaning that the obtained Poisson deviance is approximate. An alternative
+# approach could be to use :class:`compose.TransformedTargetRegressor`
+# meta-estimator to map ``y_pred`` to a strictly positive domain.
+
+print("Ridge evaluation:")
+score_estimator(ridge, df_test)
+
+##############################################################################
+# Next we fit the Poisson regressor on the target variable. We set the
+# regularization strength ``alpha`` to 1 over number of samples in order to
+# mimic the Ridge regressor whose L2 penalty term scales differently with the
+# number of samples.
+ +poisson = make_pipeline( + linear_model_preprocessor, + PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) +) +poisson.fit(df_train, df_train["Frequency"], + poissonregressor__sample_weight=df_train["Exposure"]) + +print("PoissonRegressor evaluation:") +score_estimator(poisson, df_test) + +############################################################################## +# Finally, we will consider a non-linear model, namely a random forest. Random +# forests do not require the categorical data to be one-hot encoded: instead, +# we can encode each category label with an arbitrary integer using +# :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will +# treat the categorical features as ordered features, which might not be always +# a desired behavior. However this effect is limited for deep enough trees +# which are able to recover the categorical nature of the features. The main +# advantage of the :class:`preprocessing.OrdinalEncoder` over the +# :class:`preprocessing.OneHotEncoder` is that it will make training faster. + +rf_preprocessor = ColumnTransformer( + [ + ("categorical", OrdinalEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("numeric", "passthrough", + ["VehAge", "DrivAge", "BonusMalus", "Density"]), + ], + remainder="drop", +) +rf = make_pipeline( + rf_preprocessor, + RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) +) +rf.fit(df_train, df_train["Frequency"].values, + randomforestregressor__sample_weight=df_train["Exposure"].values) + + +print("RandomForestRegressor evaluation:") +score_estimator(rf, df_test) + + +############################################################################## +# Like the Ridge regression above, the random forest model minimizes the +# conditional squared error, too. However, because of a higher predictive +# power, it also results in a smaller Poisson deviance than the Poisson +# regression model. 
+# +# Evaluating models with a single train / test split is prone to random +# fluctuations. If computing resources allow, it should be verified that +# cross-validated performance metrics would lead to similar conclusions. +# +# The qualitative difference between these models can also be visualized by +# comparing the histogram of observed target values with that of predicted +# values: + +fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True) +fig.subplots_adjust(bottom=0.2) +n_bins = 20 +for row_idx, label, df in zip(range(2), + ["train", "test"], + [df_train, df_test]): + df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins), + ax=axes[row_idx, 0]) + + axes[row_idx, 0].set_title("Data") + axes[row_idx, 0].set_yscale('log') + axes[row_idx, 0].set_xlabel("y (observed Frequency)") + axes[row_idx, 0].set_ylim([1e1, 5e5]) + axes[row_idx, 0].set_ylabel(label + " samples") + + for idx, model in enumerate([ridge, poisson, rf]): + y_pred = model.predict(df) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), + ax=axes[row_idx, idx+1]) + axes[row_idx, idx + 1].set( + title=model[-1].__class__.__name__, + yscale='log', + xlabel="y_pred (predicted expected Frequency)" + ) +plt.tight_layout() + +############################################################################## +# The experimental data presents a long tail distribution for ``y``. In all +# models we predict a mean expected value, so we will have necessarily fewer +# extreme values. Additionally, the normal distribution used in ``Ridge`` and +# ``RandomForestRegressor`` has a constant variance, while for the Poisson +# distribution used in ``PoissonRegressor``, the variance is proportional to +# the mean predicted value. +# +# Thus, among the considered estimators, ``PoissonRegressor`` is better suited +# for modeling the long tail distribution of the data as compared to the +# ``Ridge`` and ``RandomForestRegressor`` estimators. 
+# +# To ensure that estimators yield reasonable predictions for different +# policyholder types, we can bin test samples according to ``y_pred`` returned +# by each model. Then for each bin, we compare the mean predicted ``y_pred``, +# with the mean observed target: + + +def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, + n_bins=100): + """Compare predictions and observations for bins ordered by y_pred. + + We order the samples by ``y_pred`` and split it in bins. + In each bin the observed mean is compared with the predicted mean. + + Parameters + ---------- + y_true: array-like of shape (n_samples,) + Ground truth (correct) target values. + y_pred: array-like of shape (n_samples,) + Estimated target values. + sample_weight : array-like of shape (n_samples,) + Sample weights. + n_bins: int + Number of bins to use. + + Returns + ------- + bin_centers: ndarray of shape (n_bins,) + bin centers + y_true_bin: ndarray of shape (n_bins,) + average y_pred for each bin + y_pred_bin: ndarray of shape (n_bins,) + average y_pred for each bin + """ + idx_sort = np.argsort(y_pred) + bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins + y_pred_bin = np.zeros(n_bins) + y_true_bin = np.zeros(n_bins) + + for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)): + weights = sample_weight[idx_sort][sl] + y_pred_bin[n] = np.average( + y_pred[idx_sort][sl], weights=weights + ) + y_true_bin[n] = np.average( + y_true[idx_sort][sl], + weights=weights + ) + return bin_centers, y_true_bin, y_pred_bin + + +fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) +plt.subplots_adjust(wspace=0.3) + +for axi, model in zip(ax, [ridge, poisson, rf]): + y_pred = model.predict(df_test) + + q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( + df_test["Frequency"].values, + y_pred, + sample_weight=df_test["Exposure"].values, + n_bins=10) + + axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") + axi.plot(q, y_true_seg, marker='x', linestyle="--", 
label="observations")
+    axi.set_xlim(0, 1.0)
+    axi.set_ylim(0, 0.6)
+    axi.set(
+        title=model[-1].__class__.__name__,
+        xlabel='Fraction of samples sorted by y_pred',
+        ylabel='Mean Frequency (y_pred)'
+    )
+    axi.legend()
+plt.tight_layout()
+
+##############################################################################
+# The ``Ridge`` regression model can predict very low expected frequencies
+# that do not match the data. It can therefore severely under-estimate the risk
+# for some policyholders.
+#
+# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency
+# between predicted and observed targets, especially for low predicted target
+# values.
+#
+# However, for some business applications, we are not necessarily interested
+# in the ability of the model to predict the expected frequency value, but
+# instead to predict which policyholder groups are the riskiest and which are
+# the safest. In this case, the model evaluation would cast the problem as a
+# ranking problem rather than a regression problem.
+# +# To compare the 3 models within this perspective, one can plot the fraction of +# the number of claims vs the fraction of exposure for test samples ordered by +# the model predictions, from riskiest to safest according to each model: + + +def _cumulated_claims(y_true, y_pred, exposure): + idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest + sorted_exposure = exposure[idx_sort] + sorted_frequencies = y_true[idx_sort] + cumulated_exposure = np.cumsum(sorted_exposure) + cumulated_exposure /= cumulated_exposure[-1] + cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) + cumulated_claims /= cumulated_claims[-1] + return cumulated_exposure, cumulated_claims + + +fig, ax = plt.subplots(figsize=(8, 8)) + +for model in [ridge, poisson, rf]: + y_pred = model.predict(df_test) + cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + y_pred, + df_test["Exposure"].values) + area = auc(cum_exposure, cum_claims) + label = "{} (area under curve: {:.3f})".format( + model[-1].__class__.__name__, area) + ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) + +# Oracle model: y_pred == y_test +cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + df_test["Frequency"].values, + df_test["Exposure"].values) +area = auc(cum_exposure, cum_claims) +label = "Oracle (area under curve: {:.3f})".format(area) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) + +# Random Baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") +ax.set( + title="Cumulated number of claims by model", + xlabel='Fraction of exposure (from riskiest to safest)', + ylabel='Fraction of number of claims' +) +ax.legend(loc="lower right") + +############################################################################## +# This plot reveals that the random forest model is slightly better at ranking +# policyholders by risk profiles even if the absolute value of the predicted 
+# expected frequencies are less well calibrated than for the linear Poisson +# model. +# +# All three models are significantly better than chance but also very far from +# making perfect predictions. +# +# This last point is expected due to the nature of the problem: the occurrence +# of accidents is mostly dominated by circumstantial causes that are not +# captured in the columns of the dataset or that are indeed random. + +plt.show() diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py new file mode 100644 index 0000000000000..ccd18c8efff99 --- /dev/null +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -0,0 +1,596 @@ +""" +====================================== +Tweedie regression on insurance claims +====================================== + +This example illustrates the use of Poisson, Gamma and Tweedie regression on +the `French Motor Third-Party Liability Claims dataset +`_, and is inspired by an R tutorial [1]_. + +In this dataset, each sample corresponds to an insurance policy, i.e. a +contract within an insurance company and an individual (policyholder). +Available features include driver age, vehicle age, vehicle power, etc. + +A few definitions: a *claim* is the request made by a policyholder to the +insurer to compensate for a loss covered by the insurance. The *claim amount* +is the amount of money that the insurer must pay. The *exposure* is the +duration of the insurance coverage of a given policy, in years. + +Here our goal goal is to predict the expected +value, i.e. the mean, of the total claim amount per exposure unit also +referred to as the pure premium. + +There are several possibilities to do that, two of which are: + +1. 
Model the number of claims with a Poisson distribution, and the average + claim amount per claim, also known as severity, as a Gamma distribution + and multiply the predictions of both in order to get the total claim + amount. +2. Model the total claim amount per exposure directly, typically with a Tweedie + distribution of Tweedie power :math:`p \\in (1, 2)`. + +In this example we will illustrate both approaches. We start by defining a few +helper functions for loading the data and visualizing results. + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). `doi:10.2139/ssrn.3164764 + `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# Olivier Grisel +# License: BSD 3 clause +from functools import partial + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.datasets import fetch_openml +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import PoissonRegressor, GammaRegressor +from sklearn.linear_model import TweedieRegressor +from sklearn.metrics import mean_tweedie_deviance +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer + +from sklearn.metrics import mean_absolute_error, mean_squared_error, auc + + +def load_mtpl2(n_samples=100000): + """Fetch the French Motor Third-Party Liability Claims dataset. + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). Full dataset has + 678013 samples. 
+ """ + # freMTPL2freq dataset from https://www.openml.org/d/41214 + df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] + df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) + df_freq.set_index('IDpol', inplace=True) + + # freMTPL2sev dataset from https://www.openml.org/d/41215 + df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby('IDpol').sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, + title=None, ax=None, fill_legend=False): + """Plot observed and predicted - aggregated per feature level. + + Parameters + ---------- + df : DataFrame + input data + feature: str + a column name of df for the feature to be plotted + weight : str + column name of df with the values of weights or exposure + observed : str + a column name of df with the observed target + predicted : DataFrame + a dataframe, with the same index as df, with the predicted target + fill_legend : bool, default=False + whether to show fill_between legend + """ + # aggregate observed and predicted variables by feature level + df_ = df.loc[:, [feature, weight]].copy() + df_["observed"] = df[observed] * df[weight] + df_["predicted"] = predicted * df[weight] + df_ = ( + df_.groupby([feature])[weight, "observed", "predicted"] + .sum() + .assign(observed=lambda x: x["observed"] / x[weight]) + .assign(predicted=lambda x: x["predicted"] / x[weight]) + ) + + ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) + y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 + p2 = ax.fill_between( + df_.index, + 0, + y_max * df_[weight] / df_[weight].values.max(), + color="g", + alpha=0.1, + ) + if fill_legend: + ax.legend([p2], 
["{} distribution".format(feature)]) + ax.set( + ylabel=y_label if y_label is not None else None, + title=title if title is not None else "Train: Observed vs Predicted", + ) + + +def score_estimator( + estimator, X_train, X_test, df_train, df_test, target, weights, + tweedie_powers=None, +): + """Evaluate an estimator on train and test sets with different metrics""" + + metrics = [ + ("D² explained", None), # Use default scorer if it exists + ("mean abs. error", mean_absolute_error), + ("mean squared error", mean_squared_error), + ] + if tweedie_powers: + metrics += [( + "mean Tweedie dev p={:.4f}".format(power), + partial(mean_tweedie_deviance, power=power) + ) for power in tweedie_powers] + + res = [] + for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), + ]: + y, _weights = df[target], df[weights] + for score_label, metric in metrics: + if isinstance(estimator, tuple) and len(estimator) == 2: + # Score the model consisting of the product of frequency and + # severity models. 
+ est_freq, est_sev = estimator + y_pred = est_freq.predict(X) * est_sev.predict(X) + else: + y_pred = estimator.predict(X) + + if metric is None: + if not hasattr(estimator, "score"): + continue + score = estimator.score(X, y, _weights) + else: + score = metric(y, y_pred, _weights) + + res.append( + {"subset": subset_label, "metric": score_label, "score": score} + ) + + res = ( + pd.DataFrame(res) + .set_index(["metric", "subset"]) + .score.unstack(-1) + .round(4) + .loc[:, ['train', 'test']] + ) + return res + + +############################################################################## +# Loading datasets, basic feature extraction and target definitions +# ----------------------------------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``), with the freMTPL2sev table, +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). + +df = load_mtpl2(n_samples=60000) + +# Note: filter out claims with zero amount, as the severity model +# requires strictly positive target values. 
+df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0
+
+# Correct for unreasonable observations (that might be data error)
+# and a few exceptionally large claim amounts
+df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
+df["Exposure"] = df["Exposure"].clip(upper=1)
+df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)
+
+log_scale_transformer = make_pipeline(
+    FunctionTransformer(func=np.log),
+    StandardScaler()
+)
+
+column_trans = ColumnTransformer(
+    [
+        ("binned_numeric", KBinsDiscretizer(n_bins=10),
+         ["VehAge", "DrivAge"]),
+        ("onehot_categorical", OneHotEncoder(),
+         ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
+        ("passthrough_numeric", "passthrough",
+         ["BonusMalus"]),
+        ("log_scaled_numeric", log_scale_transformer,
+         ["Density"]),
+    ],
+    remainder="drop",
+)
+X = column_trans.fit_transform(df)
+
+# Insurance companies are interested in modeling the Pure Premium, that is
+# the expected total claim amount per unit of exposure for each policyholder
+# in their portfolio:
+df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]
+
+# This can be indirectly approximated by a 2-step modeling: the product of the
+# Frequency times the average claim amount per claim:
+df["Frequency"] = df["ClaimNb"] / df["Exposure"]
+df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)
+
+with pd.option_context("display.max_columns", 15):
+    print(df[df.ClaimAmount > 0].head())
+
+##############################################################################
+#
+# Frequency model -- Poisson distribution
+# ---------------------------------------
+#
+# The number of claims (``ClaimNb``) is a positive integer (0 included).
+# Thus, this target can be modelled by a Poisson distribution.
+# It is then assumed to be the number of discrete events occurring with a
+# constant rate in a given time interval (``Exposure``, in units of years).
+# Here we model the frequency ``y = ClaimNb / Exposure``, which is still a +# (scaled) Poisson distribution, and use ``Exposure`` as `sample_weight`. + +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) + +# The parameters of the model are estimated by minimizing the Poisson deviance +# on the training set via a quasi-Newton solver: l-BFGS. Some of the features +# are collinear, we use a weak penalization to avoid numerical issues. +glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400) +glm_freq.fit(X_train, df_train["Frequency"], + sample_weight=df_train["Exposure"]) + +scores = score_estimator( + glm_freq, + X_train, + X_test, + df_train, + df_test, + target="Frequency", + weights="Exposure", +) +print("Evaluation of PoissonRegressor on target Frequency") +print(scores) + +############################################################################## +# We can visually compare observed and predicted values, aggregated by the +# drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance +# bonus/malus (``BonusMalus``). 
+ +fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 8)) +fig.subplots_adjust(hspace=0.3, wspace=0.2) + +plot_obs_pred( + df=df_train, + feature="DrivAge", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_train), + y_label="Claim Frequency", + title="train data", + ax=ax[0, 0], +) + +plot_obs_pred( + df=df_test, + feature="DrivAge", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), + y_label="Claim Frequency", + title="test data", + ax=ax[0, 1], + fill_legend=True +) + +plot_obs_pred( + df=df_test, + feature="VehAge", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), + y_label="Claim Frequency", + title="test data", + ax=ax[1, 0], + fill_legend=True +) + +plot_obs_pred( + df=df_test, + feature="BonusMalus", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), + y_label="Claim Frequency", + title="test data", + ax=ax[1, 1], + fill_legend=True +) + + +############################################################################## +# According to the observed data, the frequency of accidents is higher for +# drivers younger than 30 years old, and is positively correlated with the +# `BonusMalus` variable. Our model is able to mostly correctly model this +# behaviour. +# +# Severity Model - Gamma distribution +# ------------------------------------ +# The mean claim amount or severity (`AvgClaimAmount`) can be empirically +# shown to follow approximately a Gamma distribution. We fit a GLM model for +# the severity with the same features as the frequency model. +# +# Note: +# +# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support +# on :math:`(0, \infty)`, not :math:`[0, \infty)`. +# - We use ``ClaimNb`` as `sample_weight` to account for policies that contain +# more than one claim. 
+ +mask_train = df_train["ClaimAmount"] > 0 +mask_test = df_test["ClaimAmount"] > 0 + +glm_sev = GammaRegressor(alpha=10., max_iter=10000) + +glm_sev.fit( + X_train[mask_train.values], + df_train.loc[mask_train, "AvgClaimAmount"], + sample_weight=df_train.loc[mask_train, "ClaimNb"], +) + +scores = score_estimator( + glm_sev, + X_train[mask_train.values], + X_test[mask_test.values], + df_train[mask_train], + df_test[mask_test], + target="AvgClaimAmount", + weights="ClaimNb", +) +print("Evaluation of GammaRegressor on target AvgClaimAmount") +print(scores) + +############################################################################## +# Here, the scores for the test data call for caution as they are +# significantly worse than for the training data indicating an overfit despite +# the strong regularization. +# +# Note that the resulting model is the average claim amount per claim. As +# such, it is conditional on having at least one claim, and cannot be used to +# predict the average claim amount per policy in general. + +print("Mean AvgClaim Amount per policy: %.2f " + % df_train["AvgClaimAmount"].mean()) +print("Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean()) +print("Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" + % glm_sev.predict(X_train).mean()) + + +############################################################################## +# We can visually compare observed and predicted values, aggregated for +# the drivers age (``DrivAge``). 
+ +fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 6)) + +plot_obs_pred( + df=df_train.loc[mask_train], + feature="DrivAge", + weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_train[mask_train.values]), + y_label="Average Claim Severity", + title="train data", + ax=ax[0], +) + +plot_obs_pred( + df=df_test.loc[mask_test], + feature="DrivAge", + weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_test[mask_test.values]), + y_label="Average Claim Severity", + title="test data", + ax=ax[1], + fill_legend=True +) +plt.tight_layout() + +############################################################################## +# Overall, the drivers age (``DrivAge``) has a weak impact on the claim +# severity, both in observed and predicted data. +# +# Pure Premium Modeling via a Product Model vs single TweedieRegressor +# -------------------------------------------------------------------- +# As mentioned in the introduction, the total claim amount per unit of +# exposure can be modeled as the product of the prediction of the +# frequency model by the prediction of the severity model. +# +# Alternatively, one can directly model the total loss with a unique +# Compound Poisson Gamma generalized linear model (with a log link function). +# This model is a special case of the Tweedie GLM with a "power" parameter +# :math:`p \in (1, 2)`. Here, we fix apriori the `power` parameter of the +# Tweedie model to some arbitrary value (1.9) in the valid range. Ideally one +# would select this value via grid-search by minimizing the negative +# log-likelihood of the Tweedie model, but unfortunately the current +# implementation does not allow for this (yet). +# +# We will compare the performance of both approaches. +# To quantify the performance of both models, one can compute +# the mean deviance of the train and test data assuming a Compound +# Poisson-Gamma distribution of the total claim amount. 
This is equivalent to +# a Tweedie distribution with a `power` parameter between 1 and 2. +# +# The :func:`sklearn.metrics.mean_tweedie_deviance` depends on a `power` +# parameter. As we do not know the true value of the `power` parameter, we here +# compute the mean deviances for a grid of possible values, and compare the +# models side by side, i.e. we compare them at identical values of `power`. +# Ideally, we hope that one model will be consistently better than the other, +# regardless of `power`. + +glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000) +glm_pure_premium.fit(X_train, df_train["PurePremium"], + sample_weight=df_train["Exposure"]) + +tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999] + +scores_product_model = score_estimator( + (glm_freq, glm_sev), + X_train, + X_test, + df_train, + df_test, + target="PurePremium", + weights="Exposure", + tweedie_powers=tweedie_powers, +) + +scores_glm_pure_premium = score_estimator( + glm_pure_premium, + X_train, + X_test, + df_train, + df_test, + target="PurePremium", + weights="Exposure", + tweedie_powers=tweedie_powers +) + +scores = pd.concat([scores_product_model, scores_glm_pure_premium], + axis=1, sort=True, + keys=('Product Model', 'TweedieRegressor')) +print("Evaluation of the Product Model and the Tweedie Regressor " + "on target PurePremium") +with pd.option_context('display.expand_frame_repr', False): + print(scores) + +############################################################################## +# In this example, both modeling approaches yield comparable performance +# metrics. For implementation reasons, the percentage of explained variance +# :math:`D^2` is not available for the product model. +# +# We can additionally validate these models by comparing observed and +# predicted total claim amount over the test and train subsets. We see that, +# on average, both model tend to underestimate the total claim (but this +# behavior depends on the amount of regularization). 
+ +res = [] +for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), +]: + exposure = df["Exposure"].values + res.append( + { + "subset": subset_label, + "observed": df["ClaimAmount"].values.sum(), + "predicted, frequency*severity model": np.sum( + exposure * glm_freq.predict(X) * glm_sev.predict(X) + ), + "predicted, tweedie, power=%.2f" + % glm_pure_premium.power: np.sum( + exposure * glm_pure_premium.predict(X)), + } + ) + +print(pd.DataFrame(res).set_index("subset").T) + +############################################################################## +# Finally, we can compare the two models using a plot of cumulated claims: for +# each model, the policyholders are ranked from safest to riskiest and the +# fraction of observed total cumulated claims is plotted on the y axis. This +# plot is often called the ordered Lorenz curve of the model. +# +# The Gini coefficient (based on the area under the curve) can be used as a +# model selection metric to quantify the ability of the model to rank +# policyholders. Note that this metric does not reflect the ability of the +# models to make accurate predictions in terms of absolute value of total +# claim amounts but only in terms of relative amounts as a ranking metric. +# +# Both models are able to rank policyholders by risky-ness significantly +# better than chance although they are also both far from perfect due to the +# natural difficulty of the prediction problem from few features. +# +# Note that the Gini index only characterize the ranking performance of the +# model but not its calibration: any monotonic transformation of the +# predictions leaves the Gini index of the model unchanged. +# +# Finally one should highlight that the Compound Poisson Gamma model that +# is directly fit on the pure premium is operationally simpler to develop and +# maintain as it consists in a single scikit-learn estimator instead of a +# pair of models, each with its own set of hyperparameters. 
+ + +def lorenz_curve(y_true, y_pred, exposure): + y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) + exposure = np.asarray(exposure) + + # order samples by increasing predicted risk: + ranking = np.argsort(y_pred) + ranked_exposure = exposure[ranking] + ranked_pure_premium = y_true[ranking] + cumulated_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure) + cumulated_claim_amount /= cumulated_claim_amount[-1] + cumulated_samples = np.linspace(0, 1, len(cumulated_claim_amount)) + return cumulated_samples, cumulated_claim_amount + + +fig, ax = plt.subplots(figsize=(8, 8)) + +y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test) +y_pred_total = glm_pure_premium.predict(X_test) + +for label, y_pred in [("Frequency * Severity model", y_pred_product), + ("Compound Poisson Gamma", y_pred_total)]: + ordered_samples, cum_claims = lorenz_curve( + df_test["PurePremium"], y_pred, df_test["Exposure"]) + gini = 1 - 2 * auc(ordered_samples, cum_claims) + label += " (Gini index: {:.3f})".format(gini) + ax.plot(ordered_samples, cum_claims, linestyle="-", label=label) + +# Oracle model: y_pred == y_test +ordered_samples, cum_claims = lorenz_curve( + df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]) +gini = 1 - 2 * auc(ordered_samples, cum_claims) +label = "Oracle (Gini index: {:.3f})".format(gini) +ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray", + label=label) + +# Random baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") +ax.set( + title="Lorenz Curves", + xlabel=('Fraction of policyholders\n' + '(ordered by model from safest to riskiest)'), + ylabel='Fraction of total claim amount' +) +ax.legend(loc="upper left") +plt.plot() diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py new file mode 100644 index 
0000000000000..cb20fda1c022d --- /dev/null +++ b/sklearn/_loss/glm_distribution.py @@ -0,0 +1,355 @@ +""" +Distribution functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +from collections import namedtuple +import numbers + +import numpy as np +from scipy.special import xlogy + + +DistributionBoundary = namedtuple("DistributionBoundary", + ("value", "inclusive")) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`, + unit variance :math:`v(y_\textrm{pred})` and + unit deviance :math:`d(y,y_\textrm{pred})`. + + Methods + ------- + deviance + deviance_derivative + in_y_range + unit_deviance + unit_deviance_derivative + unit_variance + + References + ---------- + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + + def in_y_range(self, y): + """Returns ``True`` if y is in the valid range of Y~EDM. + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + """ + # Note that currently supported distributions have +inf upper bound + + if not isinstance(self._lower_bound, DistributionBoundary): + raise TypeError('_lower_bound attribute must be of type ' + 'DistributionBoundary') + + if self._lower_bound.inclusive: + return np.greater_equal(y, self._lower_bound.value) + else: + return np.greater(y, self._lower_bound.value) + + @abstractmethod + def unit_variance(self, y_pred): + r"""Compute the unit variance function. 
+ + The unit variance :math:`v(y_\textrm{pred})` determines the variance as + a function of the mean :math:`y_\textrm{pred}` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(y_\textrm{pred}_i)`. + It can also be derived from the unit deviance + :math:`d(y,y_\textrm{pred})` as + + .. math:: v(y_\textrm{pred}) = \frac{2}{ + \frac{\partial^2 d(y,y_\textrm{pred})}{ + \partialy_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}} + + See also :func:`variance`. + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Predicted mean. + """ + + @abstractmethod + def unit_deviance(self, y, y_pred, check_input=False): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the + log-likelihood as + :math:`d(y,y_\textrm{pred}) = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or y_pred values, otherwise + they will be propagated as NaN. + Returns + ------- + deviance: array of shape (n_samples,) + Computed deviance + """ + + def unit_deviance_derivative(self, y, y_pred): + r"""Compute the derivative of the unit deviance w.r.t. y_pred. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partialy_\textrm{pred}}d(y,y_\textrm{pred}) + = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}` + with unit variance :math:`v(y_\textrm{pred})`. + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + """ + return -2 * (y - y_pred) / self.unit_variance(y_pred) + + def deviance(self, y, y_pred, weights=1): + r"""Compute the deviance. 
+ + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)` + with weights :math:`s_i` and unit deviance + :math:`d(y,y_\textrm{pred})`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + + weights : {int, array of shape (n_samples,)}, default=1 + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, y_pred)) + + def deviance_derivative(self, y, y_pred, weights=1): + r"""Compute the derivative of the deviance w.r.t. y_pred. + + It gives :math:`\frac{\partial}{\partial y_\textrm{pred}} + D(y, \y_\textrm{pred}; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + y_pred : array, shape (n_samples,) + Predicted mean. + + weights : {int, array of shape (n_samples,)}, default=1 + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, y_pred) + + +class TweedieDistribution(ExponentialDispersionModel): + r"""A class for the Tweedie distribution. + + A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]` + is uniquely defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^power`. + + Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (1,2) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + + Parameters + ---------- + power : float, default=0 + The variance power of the `unit_variance` + :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. 
+ For ``0=1.') + elif 1 <= power < 2: + # Poisson or Compound Poisson distribution + self._lower_bound = DistributionBoundary(0, inclusive=True) + elif power >= 2: + # Gamma, Positive Stable, Inverse Gaussian distributions + self._lower_bound = DistributionBoundary(0, inclusive=False) + else: # pragma: no cover + # this branch should be unreachable. + raise ValueError + + self._power = power + + def unit_variance(self, y_pred): + """Compute the unit variance of a Tweedie distribution + v(y_\textrm{pred})=y_\textrm{pred}**power. + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Predicted mean. + """ + return np.power(y_pred, self.power) + + def unit_deviance(self, y, y_pred, check_input=False): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the + log-likelihood as + :math:`d(y,y_\textrm{pred}) = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array of shape (n_samples,) + Target values. + + y_pred : array of shape (n_samples,) + Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or y_pred values, otherwise + they will be propagated as NaN. 
+ Returns + ------- + deviance: array of shape (n_samples,) + Computed deviance + """ + p = self.power + + if check_input: + message = ("Mean Tweedie deviance error with power={} can only be " + "used on ".format(p)) + if p < 0: + # 'Extreme stable', y any real number, y_pred > 0 + if (y_pred <= 0).any(): + raise ValueError(message + "strictly positive y_pred.") + elif p == 0: + # Normal, y and y_pred can be any real number + pass + elif 0 < p < 1: + raise ValueError("Tweedie deviance is only defined for " + "power<=0 and power>=1.") + elif 1 <= p < 2: + # Poisson and Compound Poisson distribution, y >= 0, y_pred > 0 + if (y < 0).any() or (y_pred <= 0).any(): + raise ValueError(message + "non-negative y and strictly " + "positive y_pred.") + elif p >= 2: + # Gamma and Extreme stable distribution, y and y_pred > 0 + if (y <= 0).any() or (y_pred <= 0).any(): + raise ValueError(message + + "strictly positive y and y_pred.") + else: # pragma: nocover + # Unreachable statement + raise ValueError + + if p < 0: + # 'Extreme stable', y any real number, y_pred > 0 + dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) + + elif p == 0: + # Normal distribution, y and y_pred any real number + dev = (y - y_pred)**2 + elif p < 1: + raise ValueError("Tweedie deviance is only defined for power<=0 " + "and power>=1.") + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y, y/y_pred) - y + y_pred) + elif p == 2: + # Gamma distribution + dev = 2 * (np.log(y_pred/y) + y/y_pred - 1) + else: + dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) + return dev + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super().__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def 
__init__(self): + super().__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super().__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super().__init__(power=3) + + +EDM_DISTRIBUTIONS = { + 'normal': NormalDistribution, + 'poisson': PoissonDistribution, + 'gamma': GammaDistribution, + 'inverse-gaussian': InverseGaussianDistribution, +} diff --git a/sklearn/_loss/tests/__init__.py b/sklearn/_loss/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/_loss/tests/test_glm_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py new file mode 100644 index 0000000000000..cb4c5ae07e4d1 --- /dev/null +++ b/sklearn/_loss/tests/test_glm_distribution.py @@ -0,0 +1,112 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause +import numpy as np +from numpy.testing import ( + assert_allclose, + assert_array_equal, +) +from scipy.optimize import check_grad +import pytest + +from sklearn._loss.glm_distribution import ( + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + DistributionBoundary +) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +def test_invalid_distribution_bound(): + dist = TweedieDistribution() + dist._lower_bound = 0 + with 
pytest.raises(TypeError, + match="must be of type DistributionBoundary"): + dist.in_y_range([-1, 0, 1]) + + +def test_tweedie_distribution_power(): + msg = "distribution is only defined for power<=0 and power>=1" + with pytest.raises(ValueError, match=msg): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert isinstance(dist._lower_bound, DistributionBoundary) + + assert dist._lower_bound.inclusive is False + dist.power = 1 + assert dist._lower_bound.value == 0.0 + assert dist._lower_bound.inclusive is True + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_allclose(family.deviance(x, x), 0, atol=1e-9) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), + PoissonDistribution(), + GammaDistribution(), + InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=-4)], + ids=lambda x: x.__class__.__name__ +) +def test_deviance_derivative(family): + """Test deviance derivative for different families.""" + rng = np.random.RandomState(0) + y_true = rng.rand(10) + # make data positive + y_true += np.abs(y_true.min()) + 1e-2 + + y_pred = y_true + 
np.fmax(rng.rand(10), 0.) + + dev = family.deviance(y_true, y_pred) + assert isinstance(dev, float) + dev_derivative = family.deviance_derivative(y_true, y_pred) + assert dev_derivative.shape == y_pred.shape + + err = check_grad( + lambda y_pred: family.deviance(y_true, y_pred), + lambda y_pred: family.deviance_derivative(y_true, y_pred), + y_pred, + ) / np.linalg.norm(dev_derivative) + assert abs(err) < 1e-6 diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 59d0600d508d0..110e0008bccc9 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -7,7 +7,6 @@ # complete documentation. from ._base import LinearRegression - from ._bayes import BayesianRidge, ARDRegression from ._least_angle import (Lars, LassoLars, lars_path, lars_path_gram, LarsCV, LassoLarsCV, LassoLarsIC) @@ -15,6 +14,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from ._glm import (PoissonRegressor, + GammaRegressor, TweedieRegressor) from ._huber import HuberRegressor from ._sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from ._stochastic_gradient import SGDClassifier, SGDRegressor @@ -73,4 +74,7 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'PoissonRegressor', + 'GammaRegressor', + 'TweedieRegressor'] diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py new file mode 100644 index 0000000000000..3b5c0d95d6124 --- /dev/null +++ b/sklearn/linear_model/_glm/__init__.py @@ -0,0 +1,15 @@ +# License: BSD 3 clause + +from .glm import ( + GeneralizedLinearRegressor, + PoissonRegressor, + GammaRegressor, + TweedieRegressor +) + +__all__ = [ + "GeneralizedLinearRegressor", + "PoissonRegressor", + "GammaRegressor", + "TweedieRegressor" +] diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py new file mode 100644 index 0000000000000..8607d6a1828ab 
--- /dev/null +++ b/sklearn/linear_model/_glm/glm.py @@ -0,0 +1,615 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. +# License: BSD 3 clause + +import numbers + +import numpy as np +import scipy.optimize + +from ...base import BaseEstimator, RegressorMixin +from ...utils import check_array, check_X_y +from ...utils.optimize import _check_optimize_result +from ...utils.validation import check_is_fitted, _check_sample_weight +from ..._loss.glm_distribution import ( + ExponentialDispersionModel, + TweedieDistribution, + EDM_DISTRIBUTIONS +) +from .link import ( + BaseLink, + IdentityLink, + LogLink, +) + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _y_pred_deviance_derivative(coef, X, y, weights, family, link): + """Compute y_pred and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + y_pred = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * family.deviance_derivative(y, y_pred, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # same as X.T @ temp + return y_pred, devp + + +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): + """Regression via a penalized Generalized Linear Model (GLM). + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as y_pred=h(X*w). + Therefore, the fit minimizes the following objective function with L2 + priors as regularizer:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + + 1/2 * alpha * |w|_2 + + with inverse link function h and s=sample_weight. + The parameter ``alpha`` corresponds to the lambda parameter in glmnet. 
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \ + or an ExponentialDispersionModel instance, default='normal' + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. + + link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \ + default='auto' + The link function of the GLM, i.e. mapping from linear predictor + `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets + the link depending on the chosen family as follows: + + - 'identity' for Normal distribution + - 'log' for Poisson, Gamma and Inverse Gaussian distributions + + solver : 'lbfgs', default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_``. + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. 
+ + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + """ + def __init__(self, *, alpha=1.0, + fit_intercept=True, family='normal', link='auto', + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + verbose=0): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.verbose = verbose + + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + self : returns an instance of self. 
+ """ + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + self._family_instance = EDM_DISTRIBUTIONS[self.family]() + else: + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse-gaussian']" + "; got (family={0})".format(self.family)) + + # Guarantee that self._link_instance is set to an instance of + # class BaseLink + if isinstance(self.link, BaseLink): + self._link_instance = self.link + else: + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. not to 'auto'; " + "got (link='auto', family={})" + .format(self.family)) + elif self.link == 'identity': + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() + else: + raise ValueError( + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log']; " + "got (link={0})".format(self.link)) + + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be a non-negative number;" + " got (alpha={0})".format(self.alpha)) + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool;" + " got {0}".format(self.fit_intercept)) + if self.solver not in ['lbfgs']: + raise ValueError("GeneralizedLinearRegressor supports only solvers" + "'lbfgs'; got {0}".format(self.solver)) + solver = self.solver + if (not isinstance(self.max_iter, numbers.Integral) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" + " got 
(max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) + + family = self._family_instance + link = self._link_instance + + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=[np.float64, np.float32], + y_numeric=True, multi_output=False) + + weights = _check_sample_weight(sample_weight, X) + + _, n_features = X.shape + + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that X is not rank deficient + + # rescaling of sample_weight + # + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) + weights = weights / weights.sum() + + if self.warm_start and hasattr(self, 'coef_'): + if self.fit_intercept: + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) + else: + coef = self.coef_ + else: + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) + + # algorithms for optimization + + if solver == 'lbfgs': + def func(coef, X, y, weights, alpha, family, link): + y_pred, devp = _y_pred_deviance_derivative( + coef, X, y, weights, family, link + ) + dev = family.deviance(y, y_pred, weights) + # offset if coef[0] is intercept + offset = 1 if self.fit_intercept else 0 + coef_scaled = alpha * coef[offset:] + obj = 0.5 * dev + 0.5 * (coef[offset:] @ coef_scaled) + objp = 0.5 * devp + objp[offset:] += coef_scaled + return 
obj, objp + + args = (X, y, weights, self.alpha, family, link) + + opt_res = scipy.optimize.minimize( + func, coef, method="L-BFGS-B", jac=True, + options={ + "maxiter": self.max_iter, + "iprint": (self.verbose > 0) - 1, + "gtol": self.tol, + "ftol": 1e3*np.finfo(float).eps, + }, + args=args) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res) + coef = opt_res.x + + if self.fit_intercept: + self.intercept_ = coef[0] + self.coef_ = coef[1:] + else: + # set intercept to zero as the other linear models do + self.intercept_ = 0. + self.coef_ = coef + + return self + + def _linear_predictor(self, X): + """Compute the linear_predictor = `X @ coef_ + intercept_`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + Returns + ------- + y_pred : array of shape (n_samples,) + Returns predicted values of linear predictor. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float64, np.float32], ensure_2d=True, + allow_nd=False) + return X @ self.coef_ + self.intercept_ + + def predict(self, X): + """Predict using GLM with feature matrix X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + Returns + ------- + y_pred : array of shape (n_samples,) + Returns predicted values. + """ + # check_array is done in _linear_predictor + eta = self._linear_predictor(X) + y_pred = self._link_instance.inverse(eta) + return y_pred + + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for ``family='normal'``. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. 
+ The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) + True values of target. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + weights = _check_sample_weight(sample_weight, X) + y_pred = self.predict(X) + dev = self._family_instance.deviance(y, y_pred, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1 - dev / dev_null + + def _more_tags(self): + # create the _family_instance if fit wasn't called yet. + if hasattr(self, '_family_instance'): + _family_instance = self._family_instance + elif isinstance(self.family, ExponentialDispersionModel): + _family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + _family_instance = EDM_DISTRIBUTIONS[self.family]() + else: + raise ValueError + return {"requires_positive_y": not _family_instance.in_y_range(-1.0)} + + +class PoissonRegressor(GeneralizedLinearRegressor): + """Generalized Linear Model with a Poisson distribution. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. 
bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + """ + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="poisson", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, verbose=verbose) + + @property + def family(self): + # Make this attribute read-only to avoid mis-uses e.g. in GridSearch. + return "poisson" + + @family.setter + def family(self, value): + if value != "poisson": + raise ValueError("PoissonRegressor.family must be 'poisson'!") + + +class GammaRegressor(GeneralizedLinearRegressor): + """Generalized Linear Model with a Gamma distribution. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). 
+ + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X * coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + """ + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="gamma", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, verbose=verbose) + + @property + def family(self): + # Make this attribute read-only to avoid mis-uses e.g. in GridSearch. + return "gamma" + + @family.setter + def family(self, value): + if value != "gamma": + raise ValueError("GammaRegressor.family must be 'gamma'!") + + +class TweedieRegressor(GeneralizedLinearRegressor): + """Generalized Linear Model with a Tweedie distribution. + + This estimator can be used to model different GLMs depending on the + ``power`` parameter, which determines the underlying distribution. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + power : float, default=0 + The power determines the underlying target distribution according + to the following table: + + +-------+------------------------+ + | Power | Distribution | + +=======+========================+ + | 0 | Normal | + +-------+------------------------+ + | 1 | Poisson | + +-------+------------------------+ + | (1,2) | Compound Poisson Gamma | + +-------+------------------------+ + | 2 | Gamma | + +-------+------------------------+ + | 3 | Inverse Gaussian | + +-------+------------------------+ + + For ``0 < power < 1``, no distribution exists. + + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + + link : {'auto', 'identity', 'log'}, default='auto' + The link function of the GLM, i.e. mapping from linear predictor + `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets + the link depending on the chosen family as follows: + + - 'identity' for Normal distribution + - 'log' for Poisson, Gamma and Inverse Gaussian distributions + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. 
+ + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + """ + def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True, + link='auto', max_iter=100, tol=1e-4, + warm_start=False, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family=TweedieDistribution(power=power), link=link, + max_iter=max_iter, tol=tol, + warm_start=warm_start, verbose=verbose) + + @property + def family(self): + # We use a property with a setter to make sure that the family is + # always a Tweedie distribution, and that self.power and + # self.family.power are identical by construction. + dist = TweedieDistribution(power=self.power) + # TODO: make the returned object immutable + return dist + + @family.setter + def family(self, value): + if isinstance(value, TweedieDistribution): + self.power = value.power + else: + raise TypeError("TweedieRegressor.family must be of type " + "TweedieDistribution!") diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py new file mode 100644 index 0000000000000..878d8e835bc42 --- /dev/null +++ b/sklearn/linear_model/_glm/link.py @@ -0,0 +1,110 @@ +""" +Link functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod + +import numpy as np +from scipy.special import expit, logit + + +class BaseLink(metaclass=ABCMeta): + """Abstract base class for Link functions.""" + + @abstractmethod + def __call__(self, y_pred): + """Compute the link function g(y_pred). + + The link function links the mean y_pred=E[Y] to the so called linear + predictor (X*w), i.e. g(y_pred) = linear predictor. + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Usually the (predicted) mean. 
+ """ + + @abstractmethod + def derivative(self, y_pred): + """Compute the derivative of the link g'(y_pred). + + Parameters + ---------- + y_pred : array of shape (n_samples,) + Usually the (predicted) mean. + """ + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linear predictor and the mean + y_pred=E[Y], i.e. h(linear predictor) = y_pred. + + Parameters + ---------- + lin_pred : array of shape (n_samples,) + Usually the (fitted) linear predictor. + """ + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array of shape (n_samples,) + Usually the (fitted) linear predictor. + """ + + +class IdentityLink(BaseLink): + """The identity link function g(x)=x.""" + + def __call__(self, y_pred): + return y_pred + + def derivative(self, y_pred): + return np.ones_like(y_pred) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + +class LogLink(BaseLink): + """The log link function g(x)=log(x).""" + + def __call__(self, y_pred): + return np.log(y_pred) + + def derivative(self, y_pred): + return 1 / y_pred + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(BaseLink): + """The logit link function g(x)=logit(x).""" + + def __call__(self, y_pred): + return logit(y_pred) + + def derivative(self, y_pred): + return 1 / (y_pred * (1 - y_pred)) + + def inverse(self, lin_pred): + return expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = expit(lin_pred) + return ep * (1 - ep) diff --git a/sklearn/linear_model/_glm/tests/__init__.py b/sklearn/linear_model/_glm/tests/__init__.py new file mode 100644 index 0000000000000..588cf7e93eef0 --- /dev/null +++ 
b/sklearn/linear_model/_glm/tests/__init__.py @@ -0,0 +1 @@ +# License: BSD 3 clause diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py new file mode 100644 index 0000000000000..ece8f09c76acd --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -0,0 +1,431 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + +import numpy as np +from numpy.testing import assert_allclose +import pytest +import warnings + +from sklearn.datasets import make_regression +from sklearn.linear_model._glm import GeneralizedLinearRegressor +from sklearn.linear_model import ( + TweedieRegressor, + PoissonRegressor, + GammaRegressor +) +from sklearn.linear_model._glm.link import ( + IdentityLink, + LogLink, +) +from sklearn._loss.glm_distribution import ( + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, +) +from sklearn.linear_model import Ridge +from sklearn.exceptions import ConvergenceWarning +from sklearn.model_selection import train_test_split + + +@pytest.fixture(scope="module") +def regression_data(): + X, y = make_regression(n_samples=107, + n_features=10, + n_informative=80, noise=0.5, + random_state=2) + return X, y + + +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight.""" + # scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor() + + # Positive weights are accepted + glm.fit(X, y, sample_weight=1) + + # 2d array + weights = [[0]] + with pytest.raises(ValueError, match="must be 1D array or scalar"): + glm.fit(X, y, weights) + + # 1d but wrong length + weights = [1, 0] + msg = r"sample_weight.shape == \(2,\), expected \(1,\)!" 
+ with pytest.raises(ValueError, match=msg): + glm.fit(X, y, weights) + + +@pytest.mark.parametrize('name, instance', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse-gaussian', InverseGaussianDistribution())]) +def test_glm_family_argument(name, instance): + """Test GLM family argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y) + assert isinstance(glm._family_instance, instance.__class__) + + glm = GeneralizedLinearRegressor(family='not a family') + with pytest.raises(ValueError, match="family must be"): + glm.fit(X, y) + + +@pytest.mark.parametrize('name, instance', + [('identity', IdentityLink()), + ('log', LogLink())]) +def test_glm_link_argument(name, instance): + """Test GLM link argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', link=name).fit(X, y) + assert isinstance(glm._link_instance, instance.__class__) + + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError, match="link must be"): + glm.fit(X, y) + + +@pytest.mark.parametrize('family, expected_link_class', [ + ('normal', IdentityLink), + ('poisson', LogLink), + ('gamma', LogLink), + ('inverse-gaussian', LogLink), +]) +def test_glm_link_auto(family, expected_link_class): + # Make sure link='auto' delivers the expected link function + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=family, link='auto').fit(X, y) + assert isinstance(glm._link_instance, expected_link_class) + + +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = 
GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError, + match="Penalty term must be a non-negative"): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError, match="fit_intercept must be bool"): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver', + ['not a solver', 1, [1]]) +def test_glm_solver_argument(solver): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError, match="must be a positive integer"): + glm.fit(X, y) + + +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for invalid tol argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError, match="stopping criteria must be positive"): + glm.fit(X, y) + + +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError, match="warm_start must be bool"): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', [False, True]) +def 
test_glm_identity_regression(fit_intercept): + """Test GLM regression with identity link on a simple dataset.""" + coef = [1., 2.] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.dot(X, coef) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=fit_intercept, tol=1e-12) + if fit_intercept: + glm.fit(X[:, 1:], y) + assert_allclose(glm.coef_, coef[1:], rtol=1e-10) + assert_allclose(glm.intercept_, coef[0], rtol=1e-10) + else: + glm.fit(X, y) + assert_allclose(glm.coef_, coef, rtol=1e-12) + + +@pytest.mark.parametrize('fit_intercept', [False, True]) +@pytest.mark.parametrize('alpha', [0.0, 1.0]) +@pytest.mark.parametrize('family', ['normal', 'poisson', 'gamma']) +def test_glm_sample_weight_consistentcy(fit_intercept, alpha, family): + """Test that the impact of sample_weight is consistent""" + rng = np.random.RandomState(0) + n_samples, n_features = 10, 5 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + glm_params = dict(alpha=alpha, family=family, link='auto', + fit_intercept=fit_intercept) + + glm = GeneralizedLinearRegressor(**glm_params).fit(X, y) + coef = glm.coef_.copy() + + # sample_weight=np.ones(..) 
should be equivalent to sample_weight=None
+    sample_weight = np.ones(y.shape)
+    glm.fit(X, y, sample_weight=sample_weight)
+    assert_allclose(glm.coef_, coef, rtol=1e-12)
+
+    # sample_weight are normalized to 1 so, scaling them has no effect
+    sample_weight = 2*np.ones(y.shape)
+    glm.fit(X, y, sample_weight=sample_weight)
+    assert_allclose(glm.coef_, coef, rtol=1e-12)
+
+    # setting one element of sample_weight to 0 is equivalent to removing
+    # the corresponding sample
+    sample_weight = np.ones(y.shape)
+    sample_weight[-1] = 0
+    glm.fit(X, y, sample_weight=sample_weight)
+    coef1 = glm.coef_.copy()
+    glm.fit(X[:-1], y[:-1])
+    assert_allclose(glm.coef_, coef1, rtol=1e-12)
+
+    # check that multiplying sample_weight by 2 is equivalent
+    # to repeating corresponding samples twice
+    X2 = np.concatenate([X, X[:n_samples//2]], axis=0)
+    y2 = np.concatenate([y, y[:n_samples//2]])
+    sample_weight_1 = np.ones(len(y))
+    sample_weight_1[:n_samples//2] = 2
+
+    glm1 = GeneralizedLinearRegressor(**glm_params).fit(
+        X, y, sample_weight=sample_weight_1
+    )
+
+    glm2 = GeneralizedLinearRegressor(**glm_params).fit(
+        X2, y2, sample_weight=None
+    )
+    assert_allclose(glm1.coef_, glm2.coef_)
+
+
+@pytest.mark.parametrize('fit_intercept', [True, False])
+@pytest.mark.parametrize(
+    'family',
+    [NormalDistribution(), PoissonDistribution(),
+     GammaDistribution(), InverseGaussianDistribution(),
+     TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)])
+def test_glm_log_regression(fit_intercept, family):
+    """Test GLM regression with log link on a simple dataset."""
+    coef = [0.2, -0.1]
+    X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
+    y = np.exp(np.dot(X, coef))
+    glm = GeneralizedLinearRegressor(
+        alpha=0, family=family, link='log',
+        fit_intercept=fit_intercept, tol=1e-7)
+    if fit_intercept:
+        res = glm.fit(X[:, 1:], y)
+        assert_allclose(res.coef_, coef[1:], rtol=1e-6)
+        assert_allclose(res.intercept_, coef[0], rtol=1e-6)
+    else:
+        res = glm.fit(X, y)
+
assert_allclose(res.coef_, coef, rtol=2e-6)
+
+
+@pytest.mark.parametrize('fit_intercept', [True, False])
+def test_warm_start(fit_intercept):
+    n_samples, n_features = 110, 10
+    X, y = make_regression(n_samples=n_samples, n_features=n_features,
+                           n_informative=n_features-2, noise=0.5,
+                           random_state=42)
+
+    glm1 = GeneralizedLinearRegressor(
+        warm_start=False,
+        fit_intercept=fit_intercept,
+        max_iter=1000
+    )
+    glm1.fit(X, y)
+
+    glm2 = GeneralizedLinearRegressor(
+        warm_start=True,
+        fit_intercept=fit_intercept,
+        max_iter=1
+    )
+    # As we intentionally set max_iter=1, L-BFGS-B will issue a
+    # ConvergenceWarning which we here simply ignore.
+    with warnings.catch_warnings():
+        warnings.filterwarnings('ignore', category=ConvergenceWarning)
+        glm2.fit(X, y)
+    assert glm1.score(X, y) > glm2.score(X, y)
+    glm2.set_params(max_iter=1000)
+    glm2.fit(X, y)
+    # The two models are not exactly identical since the lbfgs solver
+    # computes the approximate hessian from previous iterations, which
+    # will not be strictly identical in the case of a warm start.
+ assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) + assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) + + +@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize('sample_weight', [None, True]) +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, + sample_weight, request): + """Compare with Ridge regression for Normal distributions.""" + test_size = 10 + X, y = make_regression(n_samples=n_samples + test_size, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + random_state=42) + + if n_samples > n_features: + ridge_params = {"solver": "svd"} + else: + ridge_params = {"solver": "saga", "max_iter": 1000000, "tol": 1e-7} + + X_train, X_test, y_train, y_test, = train_test_split( + X, y, test_size=test_size, random_state=0 + ) + + alpha = 1.0 + if sample_weight is None: + sw_train = None + alpha_ridge = alpha * n_samples + else: + sw_train = np.random.RandomState(0).rand(len(y_train)) + alpha_ridge = alpha * sw_train.sum() + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha_ridge, normalize=False, + random_state=42, fit_intercept=fit_intercept, + **ridge_params) + ridge.fit(X_train, y_train, sample_weight=sw_train) + + glm = GeneralizedLinearRegressor(alpha=alpha, family='normal', + link='identity', + fit_intercept=fit_intercept, + max_iter=300, + tol=1e-5) + glm.fit(X_train, y_train, sample_weight=sw_train) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=2e-4) + assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4) + + +def test_poisson_glmnet(): + """Compare Poisson regression with L2 regularization and LogLink to glmnet + """ + # library("glmnet") + # options(digits=10) + # df <- 
data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, + fit_intercept=True, family='poisson', + link='log', tol=1e-7, + max_iter=300) + glm.fit(X, y) + assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) + + +def test_convergence_warning(regression_data): + X, y = regression_data + + est = GeneralizedLinearRegressor(max_iter=1, tol=1e-20) + with pytest.warns(ConvergenceWarning): + est.fit(X, y) + + +def test_poisson_regression_family(regression_data): + # Make sure the family attribute is read-only to prevent searching over it + # e.g. in a grid search + est = PoissonRegressor() + est.family == "poisson" + + msg = "PoissonRegressor.family must be 'poisson'!" + with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_gamma_regression_family(regression_data): + # Make sure the family attribute is read-only to prevent searching over it + # e.g. in a grid search + est = GammaRegressor() + est.family == "gamma" + + msg = "GammaRegressor.family must be 'gamma'!" 
+ with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_tweedie_regression_family(regression_data): + # Make sure the family attribute is always a TweedieDistribution and that + # the power attribute is properly updated + power = 2.0 + est = TweedieRegressor(power=power) + assert isinstance(est.family, TweedieDistribution) + assert est.family.power == power + assert est.power == power + + new_power = 0 + new_family = TweedieDistribution(power=new_power) + est.family = new_family + assert isinstance(est.family, TweedieDistribution) + assert est.family.power == new_power + assert est.power == new_power + + msg = "TweedieRegressor.family must be of type TweedieDistribution!" + with pytest.raises(TypeError, match=msg): + est.family = None + + +@pytest.mark.parametrize( + 'estimator, value', + [ + (PoissonRegressor(), True), + (GammaRegressor(), True), + (TweedieRegressor(power=1.5), True), + (TweedieRegressor(power=0), False) + ], +) +def test_tags(estimator, value): + assert estimator._get_tags()['requires_positive_y'] is value diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py new file mode 100644 index 0000000000000..27ec4ed19bdc2 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -0,0 +1,45 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause +import numpy as np +from numpy.testing import assert_allclose +import pytest +from scipy.optimize import check_grad + +from sklearn.linear_model._glm.link import ( + IdentityLink, + LogLink, + LogitLink, +) + + +LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink] + + +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_properties(Link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100) * 100 + link = Link() + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + assert_allclose(link(link.inverse(x)), x) 
+ # if g(h(x)) = x, then g'(h(x)) = 1/h'(x) + # g = link, h = link.inverse + assert_allclose(link.derivative(link.inverse(x)), + 1 / link.inverse_derivative(x)) + + +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_derivative(Link): + link = Link() + x = np.random.RandomState(0).rand(1) + err = check_grad(link, link.derivative, x) / link.derivative(x) + assert abs(err) < 1e-6 + + err = (check_grad(link.inverse, link.inverse_derivative, x) + / link.derivative(x)) + assert abs(err) < 1e-6 diff --git a/sklearn/linear_model/setup.py b/sklearn/linear_model/setup.py index 121b449d673d0..d0c9e8c04c16d 100644 --- a/sklearn/linear_model/setup.py +++ b/sklearn/linear_model/setup.py @@ -33,6 +33,8 @@ def configuration(parent_package='', top_path=None): # add other directories config.add_subpackage('tests') + config.add_subpackage('_glm') + config.add_subpackage('_glm/tests') return config diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index b626da5414e51..08f93d8f9a3b5 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -23,11 +23,10 @@ # Ashutosh Hathidara # License: BSD 3 clause - import numpy as np -from scipy.special import xlogy import warnings +from .._loss.glm_distribution import TweedieDistribution from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d @@ -743,7 +742,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): y_pred : array-like of shape (n_samples,) Estimated target values. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. 
power : float, default=0 @@ -788,47 +787,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): sample_weight = column_or_1d(sample_weight) sample_weight = sample_weight[:, np.newaxis] - message = ("Mean Tweedie deviance error with power={} can only be used on " - .format(power)) - if power < 0: - # 'Extreme stable', y_true any real number, y_pred > 0 - if (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_pred.") - dev = 2 * (np.power(np.maximum(y_true, 0), 2 - power) - / ((1 - power) * (2 - power)) - - y_true * np.power(y_pred, 1 - power)/(1 - power) - + np.power(y_pred, 2 - power)/(2 - power)) - elif power == 0: - # Normal distribution, y_true and y_pred any real number - dev = (y_true - y_pred)**2 - elif power < 1: - raise ValueError("Tweedie deviance is only defined for power<=0 and " - "power>=1.") - elif power == 1: - # Poisson distribution, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) - elif power == 2: - # Gamma distribution, y_true and y_pred > 0 - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and y_pred.") - dev = 2 * (np.log(y_pred/y_true) + y_true/y_pred - 1) - else: - if power < 2: - # 1 < p < 2 is Compound Poisson, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - else: - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and " - "y_pred.") - - dev = 2 * (np.power(y_true, 2 - power)/((1 - power) * (2 - power)) - - y_true * np.power(y_pred, 1 - power)/(1 - power) - + np.power(y_pred, 2 - power)/(2 - power)) + dist = TweedieDistribution(power=power) + dev = dist.unit_deviance(y_true, y_pred, check_input=True) return 
np.average(dev, weights=sample_weight) @@ -837,7 +797,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): """Mean Poisson deviance regression loss. Poisson deviance is equivalent to the Tweedie deviance with - the power parameter `p=1`. + the power parameter `power=1`. Read more in the :ref:`User Guide `. @@ -849,7 +809,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): y_pred : array-like of shape (n_samples,) Estimated target values. Requires y_pred > 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns @@ -874,7 +834,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): """Mean Gamma deviance regression loss. Gamma deviance is equivalent to the Tweedie deviance with - the power parameter `p=2`. It is invariant to scaling of + the power parameter `power=2`. It is invariant to scaling of the target variable, and measures relative errors. Read more in the :ref:`User Guide `. @@ -887,7 +847,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): y_pred : array-like of shape (n_samples,) Estimated target values. Requires y_pred > 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 33a8b0f30cfd6..0e89f66d0a5ed 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -135,27 +135,27 @@ def test_regression_metrics_at_limits(): mean_tweedie_deviance([0.], [0.], power=power) assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive y_pred." 
with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=1.0) power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), 2 / (2 - power)) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) diff --git a/sklearn/setup.py b/sklearn/setup.py index cc257c30e6f43..e759cdabc88ee 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -53,6 +53,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('experimental/tests') config.add_subpackage('ensemble/_hist_gradient_boosting') config.add_subpackage('ensemble/_hist_gradient_boosting/tests') + config.add_subpackage('_loss/') + config.add_subpackage('_loss/tests') # submodules which have their own setup.py config.add_subpackage('cluster') From 4905ac3518e5a804f68d59051a955be3e5e04f2c Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Wed, 4 Mar 2020 16:26:50 +0100 Subject: [PATCH 074/103] FIX Adress decomposition.PCA mle option problem (#16224) --- doc/whats_new/v0.23.rst | 4 ++ sklearn/decomposition/_pca.py | 21 ++++++-- sklearn/decomposition/tests/test_pca.py | 65 +++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 9 deletions(-) diff --git 
a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index eeaf89c174c2d..211708eca15ae 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -133,6 +133,10 @@ Changelog
 - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will
   exclusively choose the components that explain the variance greater than
   `n_components`. :pr:`15669` by :user:`Krishna Chaitanya `
+- |Fix| :func:`decomposition._pca._assess_dimension` now correctly handles small
+  eigenvalues. :pr:`4441` by :user:`Lisa Schwetlick `, and
+  :user:`Gelavizh Ahmadi ` and
+  :user:`Marija Vlajic Wheeler `.
 - |Enhancement| :class:`decomposition.NMF` and
   :func:`decomposition.non_negative_factorization` now preserves float32 dtype.
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index f64a9752896b3..7a0140b01fc9b 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -27,7 +27,7 @@
 from ..utils.validation import check_is_fitted
 
 
-def _assess_dimension_(spectrum, rank, n_samples, n_features):
+def _assess_dimension(spectrum, rank, n_samples, n_features):
     """Compute the likelihood of a rank ``rank`` dataset.
 
     The dataset is assumed to be embedded in gaussian noise of shape(n,
@@ -58,6 +58,8 @@ def _assess_dimension(spectrum, rank, n_samples, n_features):
         raise ValueError("The tested rank cannot exceed the rank of the"
                          " dataset")
 
+    spectrum_threshold = np.finfo(type(spectrum[0])).eps
+
     pu = -rank * log(2.)
     for i in range(rank):
         pu += (gammaln((n_features - i) / 2.) -
@@ -67,10 +69,14 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
     pl = -pl * n_samples / 2.
 
     if rank == n_features:
+        # TODO: this line is never executed because _infer_dimension's
+        # for loop is off by one
        pv = 0
        v = 1
     else:
        v = np.sum(spectrum[rank:]) / (n_features - rank)
+        if spectrum_threshold > v:
+            return -np.inf
        pv = -np.log(v) * n_samples * (n_features - rank) / 2.
 
     m = n_features * rank - rank * (rank + 1.) / 2.
@@ -80,6 +86,13 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): spectrum_ = spectrum.copy() spectrum_[rank:n_features] = v for i in range(rank): + if spectrum_[i] < spectrum_threshold: + # TODO: this line is never executed + # (off by one in _infer_dimension) + # this break only happens when rank == n_features and + # spectrum_[i] < spectrum_threshold, otherwise the early return + # above catches this case. + break for j in range(i + 1, len(spectrum)): pa += log((spectrum[i] - spectrum[j]) * (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples) @@ -89,7 +102,7 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): return ll -def _infer_dimension_(spectrum, n_samples, n_features): +def _infer_dimension(spectrum, n_samples, n_features): """Infers the dimension of a dataset of shape (n_samples, n_features) The dataset is described by its spectrum `spectrum`. @@ -97,7 +110,7 @@ def _infer_dimension_(spectrum, n_samples, n_features): n_spectrum = len(spectrum) ll = np.empty(n_spectrum) for rank in range(n_spectrum): - ll[rank] = _assess_dimension_(spectrum, rank, n_samples, n_features) + ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features) return ll.argmax() @@ -458,7 +471,7 @@ def _fit_full(self, X, n_components): # Postprocess the number of components required if n_components == 'mle': n_components = \ - _infer_dimension_(explained_variance_, n_samples, n_features) + _infer_dimension(explained_variance_, n_samples, n_features) elif 0 < n_components < 1.0: # number of components for which the cumulated explained # variance percentage is superior to the desired threshold diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index b94d2d5be7e0f..438478a55f6fa 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -8,8 +8,8 @@ from sklearn import datasets from sklearn.decomposition import PCA from sklearn.datasets import load_iris -from 
sklearn.decomposition._pca import _assess_dimension_ -from sklearn.decomposition._pca import _infer_dimension_ +from sklearn.decomposition._pca import _assess_dimension +from sklearn.decomposition._pca import _infer_dimension iris = datasets.load_iris() PCA_SOLVERS = ['full', 'arpack', 'randomized', 'auto'] @@ -333,7 +333,7 @@ def test_infer_dim_1(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)]) + ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)]) assert ll[1] > ll.max() - .01 * n @@ -348,7 +348,7 @@ def test_infer_dim_2(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension_(spect, n, p) > 1 + assert _infer_dimension(spect, n, p) > 1 def test_infer_dim_3(): @@ -361,7 +361,7 @@ def test_infer_dim_3(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension_(spect, n, p) > 2 + assert _infer_dimension(spect, n, p) > 2 @pytest.mark.parametrize( @@ -568,3 +568,58 @@ def test_pca_n_components_mostly_explained_variance_ratio(): n_components = pca1.explained_variance_ratio_.cumsum()[-2] pca2 = PCA(n_components=n_components).fit(X, y) assert pca2.n_components_ == X.shape[1] + + +def test_infer_dim_bad_spec(): + # Test a spectrum that drops to near zero for PR #16224 + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 5 + ret = _infer_dimension(spectrum, n_samples, n_features) + assert ret == 0 + + +def test_assess_dimension_error_rank_greater_than_features(): + # Test error when tested rank is greater than the number of features + # for PR #16224 + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 4 + rank = 5 + with pytest.raises(ValueError, match="The tested rank cannot exceed " + "the rank of the dataset"): + _assess_dimension(spectrum, rank, n_samples, n_features) + 
+ +def test_assess_dimension_small_eigenvalues(): + # Test tiny eigenvalues appropriately when using 'mle' + # for PR #16224 + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 5 + rank = 3 + ret = _assess_dimension(spectrum, rank, n_samples, n_features) + assert ret == -np.inf + + +def test_infer_dim_mle(): + # Test small eigenvalues when 'mle' with pathological 'X' dataset + # for PR #16224 + X, _ = datasets.make_classification(n_informative=1, n_repeated=18, + n_redundant=1, n_clusters_per_class=1, + random_state=42) + pca = PCA(n_components='mle').fit(X) + assert pca.n_components_ == 0 + + +def test_fit_mle_too_few_samples(): + # Tests that an error is raised when the number of samples is smaller + # than the number of features during an mle fit for PR #16224 + X, _ = datasets.make_classification(n_samples=20, n_features=21, + random_state=42) + + pca = PCA(n_components='mle', svd_solver='full') + with pytest.raises(ValueError, match="n_components='mle' is only " + "supported if " + "n_samples >= n_features"): + pca.fit(X) From fdbff6cc78bddb57057f1b48ce00b4b38e9d99b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 4 Mar 2020 17:29:39 +0100 Subject: [PATCH 075/103] DOC add 0.22.2 in website news (#16631) --- doc/templates/index.html | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/templates/index.html b/doc/templates/index.html index 0f43677e668f5..e17111fb48eef 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -158,6 +158,7 @@

    News

  1. Scikit-learn from 0.23 requires Python 3.6 or greater.
  2. +
  3. January 2020. scikit-learn 0.22.1 is available for download (Changelog).
  4. December 2019. scikit-learn 0.22 is available for download (Changelog).
  5. From 43293a67ed65a763d287d52ad098fd644ef5c6e2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 4 Mar 2020 22:33:59 +0100 Subject: [PATCH 076/103] TST Enable california_housing pandas test in cron job (#16547) --- sklearn/datasets/tests/conftest.py | 14 ++++++++++++++ .../datasets/tests/test_california_housing.py | 19 ++++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py index 85242d7335685..fdb9516e62a27 100644 --- a/sklearn/datasets/tests/conftest.py +++ b/sklearn/datasets/tests/conftest.py @@ -1,5 +1,6 @@ """ Network tests are only run, if data is already locally available, or if download is specifically requested by environment variable.""" +import builtins from os import environ import pytest from sklearn.datasets import fetch_20newsgroups @@ -59,3 +60,16 @@ def fetch_olivetti_faces_fxt(): @pytest.fixture def fetch_rcv1_fxt(): return _wrapped_fetch(fetch_rcv1, dataset_name='rcv1') + + +@pytest.fixture +def hide_available_pandas(monkeypatch): + """ Pretend pandas was not installed. 
""" + import_orig = builtins.__import__ + + def mocked_import(name, *args, **kwargs): + if name == 'pandas': + raise ImportError() + return import_orig(name, *args, **kwargs) + + monkeypatch.setattr(builtins, '__import__', mocked_import) diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index af1e1ff1370e1..a8c5514e2ec73 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -27,14 +27,11 @@ def test_fetch_asframe(fetch_california_housing_fxt): assert isinstance(bunch.target, pd.Series) -def test_pandas_dependency_message(fetch_california_housing_fxt): - try: - import pandas # noqa - pytest.skip("This test requires pandas to be not installed") - except ImportError: - # Check that pandas is imported lazily and that an informative error - # message is raised when pandas is missing: - expected_msg = ('fetch_california_housing with as_frame=True' - ' requires pandas') - with pytest.raises(ImportError, match=expected_msg): - fetch_california_housing_fxt(as_frame=True) +def test_pandas_dependency_message(fetch_california_housing_fxt, + hide_available_pandas): + # Check that pandas is imported lazily and that an informative error + # message is raised when pandas is missing: + expected_msg = ('fetch_california_housing with as_frame=True' + ' requires pandas') + with pytest.raises(ImportError, match=expected_msg): + fetch_california_housing_fxt(as_frame=True) From 0271b769db4eefe51fcaee7c81cc38ddb024f9bc Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 5 Mar 2020 11:55:25 +0100 Subject: [PATCH 077/103] EXA align lorenz curves between the two examples with GLMs (#16640) --- .../plot_poisson_regression_non_normal_loss.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 
ee863dd4198ba..4b0386edfcdf6 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -393,11 +393,11 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # # To compare the 3 models within this perspective, one can plot the fraction of # the number of claims vs the fraction of exposure for test samples ordered by -# the model predictions, from riskiest to safest according to each model: +# the model predictions, from safest to riskiest according to each model: def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest + idx_sort = np.argsort(y_pred) # from safest to riskiest sorted_exposure = exposure[idx_sort] sorted_frequencies = y_true[idx_sort] cumulated_exposure = np.cumsum(sorted_exposure) @@ -434,10 +434,10 @@ def _cumulated_claims(y_true, y_pred, exposure): label="Random baseline") ax.set( title="Cumulated number of claims by model", - xlabel='Fraction of exposure (from riskiest to safest)', + xlabel='Fraction of exposure (from safest to riskiest)', ylabel='Fraction of number of claims' ) -ax.legend(loc="lower right") +ax.legend(loc="upper left") ############################################################################## # This plot reveals that the random forest model is slightly better at ranking From f7dfe4d874fe043461a1f9d86dbe3d7b00202f8a Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Thu, 5 Mar 2020 21:53:00 +0100 Subject: [PATCH 078/103] DOC update n_jobs description in DBSCAN (#16615) Co-authored-by: JohanWork --- sklearn/cluster/_dbscan.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 6a33f411886b0..52c962052f9bc 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -83,10 +83,11 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, Note 
that weights are absolute, and default to 1. n_jobs : int or None, optional (default=None) - The number of parallel jobs to run for neighbors search. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. + The number of parallel jobs to run for neighbors search. ``None`` means + 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means + using all processors. See :term:`Glossary ` for more details. + If precomputed distance are used, parallel execution is not available + and thus n_jobs will have no effect. Returns ------- From 2131504345a38776ce604216b9099f2d176fd84f Mon Sep 17 00:00:00 2001 From: wderose Date: Fri, 6 Mar 2020 13:51:32 -0800 Subject: [PATCH 079/103] FIX Pass sample_weight when predicting on stacked folds (#16539) --- doc/whats_new/v0.23.rst | 7 +++++++ sklearn/ensemble/_stacking.py | 26 ++++++++++--------------- sklearn/ensemble/tests/test_stacking.py | 14 +++++++++++++ sklearn/utils/_mocking.py | 1 + 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 211708eca15ae..d5a6a7bfb5015 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -182,6 +182,13 @@ Changelog used during `fit`. :pr:`16437` by :user:`Jin-Hwan CHO `. +- |Fix| Fixed a bug in :class:`ensemble.StackingClassifier` and + :class:`ensemble.StackingRegressor` where the `sample_weight` + argument was not being passed to `cross_val_predict` when + evaluating the base estimators on cross-validation folds + to obtain the input to the meta estimator. + :pr:`16539` by :user:`Bill DeRose `. + :mod:`sklearn.feature_extraction` ................................. 
diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index cd18a684a4518..ba817613523f6 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -122,6 +122,10 @@ def fit(self, X, y, sample_weight=None): Note that this is supported only if all underlying estimators support sample weights. + .. versionchanged:: 0.23 + when not None, `sample_weight` is passed to all underlying + estimators + Returns ------- self : object @@ -166,10 +170,13 @@ def fit(self, X, y, sample_weight=None): self._method_name(name, est, meth) for name, est, meth in zip(names, all_estimators, stack_method) ] - + fit_params = ({"sample_weight": sample_weight} + if sample_weight is not None + else None) predictions = Parallel(n_jobs=self.n_jobs)( delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, + fit_params=fit_params, verbose=self.verbose) for est, meth in zip(all_estimators, self.stack_method_) if est != 'drop' @@ -183,21 +190,8 @@ def fit(self, X, y, sample_weight=None): ] X_meta = self._concatenate_predictions(X, predictions) - if sample_weight is not None: - try: - self.final_estimator_.fit( - X_meta, y, sample_weight=sample_weight - ) - except TypeError as exc: - if "unexpected keyword argument 'sample_weight'" in str(exc): - raise TypeError( - "Underlying estimator {} does not support sample " - "weights." 
- .format(self.final_estimator_.__class__.__name__) - ) from exc - raise - else: - self.final_estimator_.fit(X_meta, y) + _fit_single_estimator(self.final_estimator_, X_meta, y, + sample_weight=sample_weight) return self diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 1eff7ba5f7de7..f8a3f290e96b5 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -38,6 +38,7 @@ from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import KFold +from sklearn.utils._mocking import CheckingClassifier from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import ignore_warnings @@ -439,6 +440,19 @@ def test_stacking_with_sample_weight(stacker, X, y): assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0 +def test_stacking_classifier_sample_weight_fit_param(): + # check sample_weight is passed to all invocations of fit + stacker = StackingClassifier( + estimators=[ + ('lr', CheckingClassifier(expected_fit_params=['sample_weight'])) + ], + final_estimator=CheckingClassifier( + expected_fit_params=['sample_weight'] + ) + ) + stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0])) + + @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.parametrize( "stacker, X, y", diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index 25b60f7955b99..cff4183ea9bc4 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -95,6 +95,7 @@ def fit(self, X, y, **fit_params): assert self.check_X(X) if self.check_y is not None: assert self.check_y(y) + self.n_features_in_ = len(X) self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True)) if self.expected_fit_params: From 68d1fefabc744b36628fad5596259f470b480b6d Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 9 Mar 2020 06:37:53 -0400 
Subject: [PATCH 080/103] BLD Turns off memory_profiler in examples to fix CircleCI (#16629) --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index b7eb635b15f40..d8350a9713ebd 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -297,7 +297,7 @@ def __call__(self, directory): sphinx_gallery_conf = { 'doc_module': 'sklearn', 'backreferences_dir': os.path.join('modules', 'generated'), - 'show_memory': True, + 'show_memory': False, 'reference_url': { 'sklearn': None}, 'examples_dirs': ['../examples'], From a5a82ab17de524f54269f853cba93e2c5a737ed7 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 9 Mar 2020 06:38:26 -0400 Subject: [PATCH 081/103] BLD Updates osx vm image in azure pipelines (#16647) --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index df504a4ab3bf7..1aad015849b2e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -133,7 +133,7 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: macOS - vmImage: xcode9-macos10.13 + vmImage: macOS-10.14 dependsOn: [linting] matrix: pylatest_conda_mkl: From 9c17a60384645d711656dbf12af2b52787f8e7e6 Mon Sep 17 00:00:00 2001 From: Maura Pintor Date: Tue, 10 Mar 2020 05:24:26 +0100 Subject: [PATCH 082/103] FIX: normalizer l_inf should take maximum of absolute values (#16633) --- doc/whats_new/v0.23.rst | 5 +++++ sklearn/preprocessing/_data.py | 11 +++++++---- sklearn/preprocessing/tests/test_data.py | 23 ++++++++++++++++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index d5a6a7bfb5015..4f3b89dfaee94 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -363,6 +363,11 @@ Changelog computing statistics when calling `partial_fit` on sparse inputs. :pr:`16466` by :user:`Guillaume Lemaitre `. 
+- |Fix| Fix a bug in :class:`preprocessing.Normalizer` with norm='max', + which was not taking the absolute value of the maximum values before + normalizing the vectors. :pr:`16632` by + :user:`Maura Pintor ` and :user:`Battista Biggio `. + :mod:`sklearn.svm` .................. diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 72ad6bacd43b4..33e2bac562489 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1718,7 +1718,8 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): elif norm == 'l2': inplace_csr_row_normalize_l2(X) elif norm == 'max': - _, norms = min_max_axis(X, 1) + mins, maxes = min_max_axis(X, 1) + norms = np.maximum(abs(mins), maxes) norms_elementwise = norms.repeat(np.diff(X.indptr)) mask = norms_elementwise != 0 X.data[mask] /= norms_elementwise[mask] @@ -1728,7 +1729,7 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): elif norm == 'l2': norms = row_norms(X) elif norm == 'max': - norms = np.max(X, axis=1) + norms = np.max(abs(X), axis=1) norms = _handle_zeros_in_scale(norms, copy=False) X /= norms[:, np.newaxis] @@ -1746,7 +1747,7 @@ class Normalizer(TransformerMixin, BaseEstimator): Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so - that its norm (l1 or l2) equals one. + that its norm (l1, l2 or inf) equals one. This transformer is able to work both with dense numpy arrays and scipy.sparse matrix (use CSR format if you want to avoid the burden of @@ -1763,7 +1764,9 @@ class Normalizer(TransformerMixin, BaseEstimator): Parameters ---------- norm : 'l1', 'l2', or 'max', optional ('l2' by default) - The norm to use to normalize each non zero sample. + The norm to use to normalize each non zero sample. If norm='max' + is used, values will be rescaled by the maximum of the absolute + values. 
copy : boolean, optional, default True set to False to perform inplace row normalization and avoid a diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 95721a0508091..7999df083631c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1947,7 +1947,7 @@ def test_normalizer_max(): X_norm2 = toarray(X_norm2) for X_norm in (X_norm1, X_norm2): - row_maxs = X_norm.max(axis=1) + row_maxs = abs(X_norm).max(axis=1) for i in range(3): assert_almost_equal(row_maxs[i], 1.0) assert_almost_equal(row_maxs[3], 0.0) @@ -1966,6 +1966,27 @@ def test_normalizer_max(): assert_almost_equal(la.norm(X_norm[3]), 0.0) +def test_normalizer_max_sign(): + # check that we normalize by a positive number even for negative data + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + # set the row number 3 to zero + X_dense[3, :] = 0.0 + # check for mixed data where the value with + # largest magnitude is negative + X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 + X_all_neg = -np.abs(X_dense) + X_all_neg_sparse = sparse.csr_matrix(X_all_neg) + + for X in (X_dense, X_all_neg, X_all_neg_sparse): + normalizer = Normalizer(norm='max') + X_norm = normalizer.transform(X) + assert X_norm is not X + X_norm = toarray(X_norm) + assert_array_equal( + np.sign(X_norm), np.sign(toarray(X))) + + def test_normalize(): # Test normalize function # Only tests functionality not used by the tests for Normalizer. From 16f4208ca30c5bb43125d3cad97e08015102477e Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Mar 2020 09:58:55 +0100 Subject: [PATCH 083/103] ENH Add check for non binary variables in OneHotEncoder. 
(#16585) Co-authored-by: Thomas J Fan Co-authored-by: Guillaume Lemaitre --- sklearn/preprocessing/_encoders.py | 47 ++++++++++---------- sklearn/preprocessing/tests/test_encoders.py | 27 ++++++++--- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 3be4540498591..86be9d335bd9e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -225,13 +225,13 @@ class OneHotEncoder(_BaseEncoder): (if any). drop_idx_ : array of shape (n_features,) - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to - be dropped for each feature. - ``drop_idx_[i] = -1`` if no category is to be dropped from the feature - with index ``i``, e.g. when `drop='if_binary'` and the feature isn't - binary - - ``drop_idx_ = None`` if all the transformed features will be retained. + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. See Also -------- @@ -316,10 +316,10 @@ def _compute_drop_idx(self): return None elif isinstance(self.drop, str): if self.drop == 'first': - return np.zeros(len(self.categories_), dtype=np.int_) + return np.zeros(len(self.categories_), dtype=np.object) elif self.drop == 'if_binary': - return np.array([0 if len(cats) == 2 else -1 - for cats in self.categories_], dtype=np.int_) + return np.array([0 if len(cats) == 2 else None + for cats in self.categories_], dtype=np.object) else: msg = ( "Wrong input for parameter `drop`. 
Expected " @@ -354,7 +354,8 @@ def _compute_drop_idx(self): raise ValueError(msg) return np.array([np.where(cat_list == val)[0][0] for (val, cat_list) in - zip(self.drop, self.categories_)], dtype=np.int_) + zip(self.drop, self.categories_)], + dtype=np.object) def fit(self, X, y=None): """ @@ -421,7 +422,7 @@ def transform(self, X): n_samples, n_features = X_int.shape - if self.drop is not None: + if self.drop_idx_ is not None: to_drop = self.drop_idx_.copy() # We remove all the dropped categories from mask, and decrement all # categories that occur after them to avoid an empty column. @@ -431,7 +432,7 @@ def transform(self, X): n_cats = len(cats) # drop='if_binary' but feature isn't binary - if to_drop[i] == -1: + if to_drop[i] is None: # set to cardinality to not drop from X_int to_drop[i] = n_cats n_values.append(n_cats) @@ -484,16 +485,14 @@ def inverse_transform(self, X): n_samples, _ = X.shape n_features = len(self.categories_) - if self.drop is None: + if self.drop_idx_ is None: n_transformed_features = sum(len(cats) for cats in self.categories_) - elif isinstance(self.drop, str) and self.drop == 'if_binary': - n_transformed_features = sum(1 if len(cats) == 2 - else len(cats) - for cats in self.categories_) else: - n_transformed_features = sum(len(cats) - 1 - for cats in self.categories_) + n_transformed_features = sum( + len(cats) - 1 if to_drop is not None else len(cats) + for cats, to_drop in zip(self.categories_, self.drop_idx_) + ) # validate shape of passed X msg = ("Shape of the passed X data is not correct. Expected {0} " @@ -509,7 +508,7 @@ def inverse_transform(self, X): found_unknown = {} for i in range(n_features): - if self.drop is None: + if self.drop_idx_ is None or self.drop_idx_[i] is None: cats = self.categories_[i] else: cats = np.delete(self.categories_[i], self.drop_idx_[i]) @@ -532,9 +531,9 @@ def inverse_transform(self, X): if unknown.any(): found_unknown[i] = unknown # drop will either be None or handle_unknown will be error. 
If - # self.drop is not None, then we can safely assume that all of + # self.drop_idx_ is not None, then we can safely assume that all of # the nulls in each column are the dropped value - elif self.drop is not None: + elif self.drop_idx_ is not None: dropped = np.asarray(sub.sum(axis=1) == 0).flatten() if dropped.any(): X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]] @@ -581,7 +580,7 @@ def get_feature_names(self, input_features=None): for i in range(len(cats)): names = [ input_features[i] + '_' + str(t) for t in cats[i]] - if self.drop is not None: + if self.drop_idx_ is not None and self.drop_idx_[i] is not None: names.pop(self.drop_idx_[i]) feature_names.extend(names) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 2a872c2e06c49..7e23aa2d485c2 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -268,6 +268,22 @@ def test_one_hot_encoder_inverse_if_binary(): assert_array_equal(ohe.inverse_transform(X_tr), X) +# check that resetting drop option without refitting does not throw an error +@pytest.mark.parametrize('drop', ['if_binary', 'first', None]) +@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None]) +def test_one_hot_encoder_drop_reset(drop, reset_drop): + X = np.array([['Male', 1], + ['Female', 3], + ['Female', 2]], dtype=object) + ohe = OneHotEncoder(drop=drop, sparse=False) + ohe.fit(X) + X_tr = ohe.transform(X) + feature_names = ohe.get_feature_names() + ohe.set_params(drop=reset_drop) + assert_array_equal(ohe.inverse_transform(X_tr), X) + assert_allclose(ohe.transform(X), X_tr) + assert_array_equal(ohe.get_feature_names(), feature_names) + @pytest.mark.parametrize("method", ['fit', 'fit_transform']) @pytest.mark.parametrize("X", [ [1, 2], @@ -388,8 +404,9 @@ def test_one_hot_encoder_pandas(): @pytest.mark.parametrize("drop, expected_names", [('first', ['x0_c', 'x2_b']), + ('if_binary', ['x0_c', 'x1_2', 
'x2_b']), (['c', 2, 'b'], ['x0_b', 'x2_a'])], - ids=['first', 'manual']) + ids=['first', 'binary', 'manual']) def test_one_hot_encoder_feature_names_drop(drop, expected_names): X = [['c', 2, 'a'], ['b', 2, 'b']] @@ -409,7 +426,7 @@ def test_one_hot_encoder_drop_equals_if_binary(): expected = np.array([[1., 0., 0., 1.], [0., 1., 0., 0.], [0., 0., 1., 1.]]) - expected_drop_idx = np.array([-1, 0]) + expected_drop_idx = np.array([None, 0]) ohe = OneHotEncoder(drop='if_binary', sparse=False) result = ohe.fit_transform(X) @@ -423,7 +440,7 @@ def test_one_hot_encoder_drop_equals_if_binary(): expected = np.array([[1., 1.], [0., 1.], [0., 1.]]) - expected_drop_idx = np.array([0, -1]) + expected_drop_idx = np.array([0, None]) ohe = OneHotEncoder(drop='if_binary', sparse=False) result = ohe.fit_transform(X) @@ -662,9 +679,9 @@ def test_categories(density, drop): for drop_cat, drop_idx, cat_list in zip(drop, ohe_test.drop_idx_, ohe_test.categories_): - assert cat_list[drop_idx] == drop_cat + assert cat_list[int(drop_idx)] == drop_cat assert isinstance(ohe_test.drop_idx_, np.ndarray) - assert ohe_test.drop_idx_.dtype == np.int_ + assert ohe_test.drop_idx_.dtype == np.object @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) From d7fbef09ca62444f82893b3ccd5fb0947f165c2f Mon Sep 17 00:00:00 2001 From: Himanshu Garg <35988194+merrcury@users.noreply.github.com> Date: Tue, 10 Mar 2020 14:45:58 +0530 Subject: [PATCH 084/103] DOC Update LICENSE Year (#16660) --- COPYING | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/COPYING b/COPYING index 0f665f8400d08..b98af18710185 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ New BSD License -Copyright (c) 2007–2019 The scikit-learn developers. +Copyright (c) 2007–2020 The scikit-learn developers. All rights reserved. 
From 8c8383b2173e9b29d13b131e0cdaea5550392014 Mon Sep 17 00:00:00 2001 From: lrjball <50599110+lrjball@users.noreply.github.com> Date: Tue, 10 Mar 2020 09:52:47 +0000 Subject: [PATCH 085/103] BUG Fix issue with KernelPCA.inverse_transform (#16655) --- doc/whats_new/v0.23.rst | 4 ++++ sklearn/decomposition/_kernel_pca.py | 3 ++- sklearn/decomposition/tests/test_kernel_pca.py | 13 +++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 4f3b89dfaee94..42fc0cdbdee6d 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -142,6 +142,10 @@ Changelog :func:`decomposition.non_negative_factorization` now preserves float32 dtype. :pr:`16280` by :user:`Jeremie du Boisberranger `. +- |Fix| :class:`decomposition.KernelPCA` method ``inverse_transform`` now + applies the correct inverse transform to the transformed data. :pr:`16655` + by :user:`Lewis Ball `. + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index b1f83c8e0ff81..6f15ebc29f761 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -358,5 +358,6 @@ def inverse_transform(self, X): "the inverse transform is not available.") K = self._get_kernel(X, self.X_transformed_fit_) - + n_samples = self.X_transformed_fit_.shape[0] + K.flat[::n_samples + 1] += self.alpha return np.dot(K, self.dual_coef_) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index a08ae0cb7a43a..a7a9547bfa33a 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -7,6 +7,7 @@ from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles +from sklearn.datasets import make_blobs from sklearn.linear_model import Perceptron from sklearn.pipeline import Pipeline from sklearn.model_selection 
import GridSearchCV @@ -282,3 +283,15 @@ def test_kernel_conditioning(): # check that the small non-zero eigenvalue was correctly set to zero assert kpca.lambdas_.min() == 0 assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_)) + + +@pytest.mark.parametrize("kernel", + ["linear", "poly", "rbf", "sigmoid", "cosine"]) +def test_kernel_pca_inverse_transform(kernel): + X, *_ = make_blobs(n_samples=100, n_features=4, centers=[[1, 1, 1, 1]], + random_state=0) + + kp = KernelPCA(n_components=2, kernel=kernel, fit_inverse_transform=True) + X_trans = kp.fit_transform(X) + X_inv = kp.inverse_transform(X_trans) + assert_allclose(X, X_inv) From 895cb6a123fa8571c70ff956e8c6d11e90ccd1f9 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Mar 2020 23:59:23 +0100 Subject: [PATCH 086/103] [MRG] DOC Add example about interpretation of coefficients of linear models (#15706) Co-authored-by: Nicolas Hug Co-authored-by: Gael Varoquaux Co-authored-by: Joel Nothman Co-authored-by: Guillaume Lemaitre --- README.rst | 3 +- build_tools/circle/build_doc.sh | 2 +- doc/conf.py | 1 + doc/developers/contributing.rst | 3 +- doc/inspection.rst | 4 + doc/install.rst | 3 +- doc/modules/linear_model.rst | 3 +- .../scikit-learn-modern/static/css/theme.css | 38 + ...linear_model_coefficient_interpretation.py | 647 ++++++++++++++++++ 9 files changed, 699 insertions(+), 5 deletions(-) create mode 100644 examples/inspection/plot_linear_model_coefficient_interpretation.py diff --git a/README.rst b/README.rst index fa0b665bbc8dd..171a19785dd73 100644 --- a/README.rst +++ b/README.rst @@ -58,7 +58,8 @@ scikit-learn 0.23 and later require Python 3.6 or newer. Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and classes end with "Display") require Matplotlib (>= 2.1.1). For running the examples Matplotlib >= 2.1.1 is required. A few examples require -scikit-image >= 0.13, a few examples require pandas >= 0.18.0. 
+scikit-image >= 0.13, a few examples require pandas >= 0.18.0, some examples +require seaborn >= 0.9.0. User installation ~~~~~~~~~~~~~~~~~ diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index abc823facee15..b0429e41762b1 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -169,7 +169,7 @@ conda create -n $CONDA_ENV_NAME --yes --quiet python="${PYTHON_VERSION:-*}" \ cython="${CYTHON_VERSION:-*}" pytest coverage \ matplotlib="${MATPLOTLIB_VERSION:-*}" sphinx=2.1.2 pillow \ scikit-image="${SCIKIT_IMAGE_VERSION:-*}" pandas="${PANDAS_VERSION:-*}" \ - joblib memory_profiler packaging + joblib memory_profiler packaging seaborn source activate testenv pip install sphinx-gallery diff --git a/doc/conf.py b/doc/conf.py index d8350a9713ebd..c3ab17d3e73af 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -250,6 +250,7 @@ 'matplotlib': ('https://matplotlib.org/', None), 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), 'joblib': ('https://joblib.readthedocs.io/en/latest/', None), + 'seaborn': ('https://seaborn.pydata.org/', None), } v = parse(release) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 3b2f7317ee41b..00d0c9a240c60 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -553,7 +553,8 @@ the development version. Building the documentation requires installing some additional packages:: - pip install sphinx sphinx-gallery numpydoc matplotlib Pillow pandas scikit-image packaging + pip install sphinx sphinx-gallery numpydoc matplotlib Pillow pandas \ + scikit-image packaging seaborn To build the documentation, you need to be in the ``doc`` folder:: diff --git a/doc/inspection.rst b/doc/inspection.rst index b53aeb436b4cd..1304a1030abb9 100644 --- a/doc/inspection.rst +++ b/doc/inspection.rst @@ -17,6 +17,10 @@ predictions from a model and what affects them. 
This can be used to evaluate assumptions and biases of a model, design a better model, or to diagnose issues with model performance. +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` + .. toctree:: modules/partial_dependence diff --git a/doc/install.rst b/doc/install.rst index 6a2b83605c1a6..9f8c277577a3c 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -134,7 +134,8 @@ it as ``scikit-learn[alldeps]``. Scikit-learn plotting capabilities (i.e., functions start with "plot\_" and classes end with "Display") require Matplotlib (>= 2.1.1). For running the examples Matplotlib >= 2.1.1 is required. A few examples require -scikit-image >= 0.13, a few examples require pandas >= 0.18.0. +scikit-image >= 0.13, a few examples require pandas >= 0.18.0, some examples +require seaborn >= 0.9.0. .. warning:: diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fc5f254035a53..477baca9c4de3 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -146,7 +146,7 @@ a linear kernel. * :ref:`sphx_glr_auto_examples_linear_model_plot_ridge_path.py` * :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` - + * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` Ridge Complexity ---------------- @@ -232,6 +232,7 @@ computes the coefficients along the full path of possible values. * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` * :ref:`sphx_glr_auto_examples_applications_plot_tomography_l1_reconstruction.py` + * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` .. 
note:: **Feature selection with Lasso** diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index a77fb03e36f65..2b80d6fe2b762 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -963,6 +963,44 @@ div.sphx-glr-thumbcontainer { } } +/* Pandas dataframe css */ +/* Taken from: https://github.com/spatialaudio/nbsphinx/blob/fb3ba670fc1ba5f54d4c487573dbc1b4ecf7e9ff/src/nbsphinx.py#L587-L619 */ +/* FIXME: to be removed when sphinx-gallery >= 5.0 will be released */ + +table.dataframe { + border: none !important; + border-collapse: collapse; + border-spacing: 0; + border-color: transparent; + color: black; + font-size: 12px; + table-layout: fixed; +} +table.dataframe thead { + border-bottom: 1px solid black; + vertical-align: bottom; +} +table.dataframe tr, +table.dataframe th, +table.dataframe td { + text-align: right; + vertical-align: middle; + padding: 0.5em 0.5em; + line-height: normal; + white-space: normal; + max-width: none; + border: none; +} +table.dataframe th { + font-weight: bold; +} +table.dataframe tbody tr:nth-child(odd) { + background: #f5f5f5; +} +table.dataframe tbody tr:hover { + background: rgba(66, 165, 245, 0.2); +} + /* rellinks */ .sk-btn-rellink { diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py new file mode 100644 index 0000000000000..8fdad51a3ff7f --- /dev/null +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -0,0 +1,647 @@ +""" +================================================================== +Common pitfalls in interpretation of coefficients of linear models +================================================================== + +Linear models describe situations in which the target value is expected to be +a linear combination of the features (see the :ref:`linear_model` User 
Guide +section for a description of a set of linear model methods available in +scikit-learn). +Coefficients in multiple linear models represent the relationship between the +given feature, :math:`X_i` and the target, :math:`y`, assuming that all the +other features remain constant (`conditional dependence +`_). +This is different from plotting :math:`X_i` versus :math:`y` and fitting a +linear relationship: in that case all possible values of the other features are +taken into account in the estimation (marginal dependence). + +This example will provide some hints in interpreting coefficients in linear +models, pointing at problems that arise when either the linear model is not +appropriate to describe the dataset, or when features are correlated. + +We will use data from the `"Current Population Survey" +`_ from 1985 to predict +wage as a function of various features such as experience, age, or education. + +.. contents:: + :local: + :depth: 1 +""" + +print(__doc__) + +import numpy as np +import scipy as sp +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +############################################################################# +# The dataset: wages +# ------------------ +# +# We fetch the data from `OpenML `_. +# Note that setting the parameter `as_frame` to True will retrieve the data +# as a pandas dataframe. + +from sklearn.datasets import fetch_openml + +survey = fetch_openml(data_id=534, as_frame=True) + +############################################################################## +# Then, we identify features `X` and targets `y`: the column WAGE is our +# target variable (i.e., the variable which we want to predict). +# +X = survey.data[survey.feature_names] +X.describe(include="all") + +############################################################################## +# Notice that the dataset contains categorical and numerical variables. +# This will give us directions on how to preprocess the data thereafter. 
+ +X.head() + +############################################################################## +# Our target for prediction: the wage. +# Wages are described as floating-point number in :math:`k$` +y = survey.target.values.ravel() +survey.target.head() + +############################################################################### +# We split the sample into a train and a test dataset. +# Only the train dataset will be used in the following exploratory analysis. +# This is a way to emulate a real situation where predictions are performed on +# an unknown target, and we don't want our analysis and decisions to be biased +# by our knowledge of the test data. + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=42 +) + +############################################################################## +# First, let's get some insights by looking at the variable distributions and +# at the pairwise relationships between them. Only numerical +# variables will be used. In the following plot, each dot represents a sample. +# +# .. _marginal_dependencies: + +train_dataset = X_train.copy() +train_dataset.insert(0, "WAGE", y_train) +_ = sns.pairplot(train_dataset, kind='reg', diag_kind='kde') + +############################################################################## +# Looking closely at the WAGE distribution it could be noticed that it has a +# long tail and we could take its logarithm +# to simplify our problem and approximate a normal distribution. +# The WAGE is increasing when EDUCATION is increasing. +# It should be noted that the dependence between WAGE and EDUCATION +# represented here is a marginal dependence, i.e., it describe the behavior +# of a specific variable without fixing the others. +# Also, the EXPERIENCE and AGE are linearly correlated. +# +# .. 
_the-pipeline: +# +# The machine-learning pipeline +# ----------------------------- +# +# To design our machine-learning pipeline, we manually +# check the type of data that we are dealing with: + +survey.data.info() + +############################################################################# +# As seen previously, the dataset contains columns with different data types +# and we need to apply a specific preprocessing for each data types. +# In particular categorical variables cannot be included in linear model if not +# coded as integers first. In addition, to avoid categorical features to be +# treated as ordered values, we need to one-hot-encode them. +# Our pre-processor will +# +# - one-hot encode (i.e., generate a column by category) the categorical +# columns; +# - as a first approach (we will see after how the normalisation of numerical +# values will affect our discussion), keep numerical values as they are. + +from sklearn.compose import make_column_transformer +from sklearn.preprocessing import OneHotEncoder + +categorical_columns = ['RACE', 'OCCUPATION', 'SECTOR', + 'MARR', 'UNION', 'SEX', 'SOUTH'] +numerical_columns = ['EDUCATION', 'EXPERIENCE', 'AGE'] + +preprocessor = make_column_transformer( + (OneHotEncoder(drop='if_binary'), categorical_columns), + remainder='passthrough' +) + +############################################################################## +# To describe the dataset as a linear model we choose to use a ridge regressor +# with a very small regularization and to model the logarithm of the WAGE. 
+ + +from sklearn.pipeline import make_pipeline +from sklearn.linear_model import Ridge +from sklearn.compose import TransformedTargetRegressor + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=Ridge(alpha=1e-10), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +############################################################################## +# Processing the dataset +# ---------------------- +# +# First, we fit the model. + +_ = model.fit(X_train, y_train) + +############################################################################## +# Then we check the performance of the computed model plotting its predictions +# on the test set and computing, +# for example, the median absolute error of the model. + +from sklearn.metrics import median_absolute_error + +y_pred = model.predict(X_train) + +mae = median_absolute_error(y_train, y_pred) +string_score = f'MAE on training set: {mae:.2f} $/hour' +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' +fig, ax = plt.subplots(figsize=(5, 5)) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") +plt.text(3, 20, string_score) +plt.title('Ridge model, small regularization') +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +_ = plt.ylim([0, 27]) + +############################################################################## +# The model learnt is far from being a good model making accurate predictions: +# this is obvious when looking at the plot above, where good predictions +# should lie on the red line. +# In the following section, we will interpret the coefficients of the model. +# While we do so, we should keep in mind that any conclusion we may draw will +# be about +# the model that we build, rather than about the true (real-world) generative +# process of the data. 
+# +# Interpreting coefficients: scale matters +# --------------------------------------------- +# +# First of all, we can take a look to the values of the coefficients of the +# regressor we have fitted. + +feature_names = (model.named_steps['columntransformer'] + .named_transformers_['onehotencoder'] + .get_feature_names(input_features=categorical_columns)) +feature_names = np.concatenate( + [feature_names, numerical_columns]) + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) + +coefs + +############################################################################## +# The AGE coefficient is expressed in +# :math:`$/hours/(living\ years)` while the EDUCATION one is expressed +# in :math:`$/hours/(years\ of\ education)`. +# This representation of the coefficients has the advantage of making clear +# the practical predictions of the model: +# an increase of :math:`1` year in AGE means a decrease of :math:`0.030867$`, +# while an increase of :math:`1` year in EDUCATION means an increase of +# :math:`0.054699$`. +# On the other hand, categorical variables (as UNION or SEX) are adimensional +# numbers taking the value either of 0 or 1. Their coefficients are expressed +# in :math:`$/hours`. Then, we cannot compare the magnitude of different +# coefficients since the features have different natural scales, and hence +# value ranges, because of their different unit of measure. +# This is more evident if we plot the coefficients. + +coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, small regularization') +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################### +# Indeed, from the plot above the most important factor in determining WAGE +# appears to be the +# variable UNION, even if it is plausible that variables like EXPERIENCE +# should have more impact. 
+# Looking at the coefficient plot to extrapolate feature importance could be +# misleading as some of them vary on a small scale, while others, like AGE, +# varies a lot more, several decades. +# This is evident if we compare feature standard deviations. + +X_train_preprocessed = pd.DataFrame( + model.named_steps['columntransformer'].transform(X_train), + columns=feature_names +) + +X_train_preprocessed.std(axis=0).plot(kind='barh', figsize=(9, 7)) +plt.title('Features std. dev.') +plt.subplots_adjust(left=.3) + +############################################################################### +# Multiplying the coefficients by the standard deviation of the related +# feature would reduce all the coefficients to the same unit of measure. +# As we will see :ref:`after` this is equivalent to normalize +# numerical variables to their standard deviation, +# as :math:`y = \sum{coef_i \times X_i} = +# \sum{(coef_i \times std_i) \times (X_i / std_i)}`. +# +# In that way, we emphasize that the +# greater the variance of a feature, the larger the weight of the corresponding +# coefficient on the output, all else being equal. + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.std(axis=0), + columns=['Coefficient importance'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, small regularization') +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################### +# Now that the coefficients have been scaled, we can safely compare them. +# +# .. warning:: +# +# Why does the plot above suggest that an increase in age leads to a +# decrease in wage? Why the :ref:`initial pairplot +# ` is telling the opposite? +# +# The plot above tells us about dependencies between a specific feature and +# the target when all other features remain constant, i.e., **conditional +# dependencies**. 
An increase of the AGE will induce a decrease +# of the WAGE when all other features remain constant. On the contrary, an +# increase of the EXPERIENCE will induce an increase of the WAGE when all +# other features remain constant. +# Also, AGE, EXPERIENCE and EDUCATION are the three variables that most +# influence the model. +# +# Checking the variability of the coefficients +# -------------------------------------------- +# +# We can check the coefficient variability through cross-validation. +# If coefficients vary in a significant way changing the input dataset +# their robustness is not guaranteed, and they should probably be interpreted +# with caution. + +from sklearn.model_selection import cross_validate +from sklearn.model_selection import RepeatedKFold + +cv_model = cross_validate( + model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.std(axis=0) + for est in cv_model['estimator']], + columns=feature_names +) +plt.figure(figsize=(9, 7)) +sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) +sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) +plt.axvline(x=0, color='.5') +plt.xlabel('Coefficient importance') +plt.title('Coefficient importance and its variability') +plt.subplots_adjust(left=.3) + +############################################################################### +# The problem of correlated variables +# ----------------------------------- +# +# The AGE and EXPERIENCE coefficients are affected by strong variability which +# might be due to the collinearity between the 2 features: as AGE and +# EXPERIENCE vary together in the data, their effect is difficult to tease +# apart. +# To verify this interpretation we plot the variability of the AGE and +# EXPERIENCE coefficient. +# +# .. 
_covariation: + +plt.ylabel('Age coefficient') +plt.xlabel('Experience coefficient') +plt.grid(True) +plt.xlim(-0.4, 0.5) +plt.ylim(-0.4, 0.5) +plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE ' + 'across folds') + +############################################################################### +# Two regions are populated: when the EXPERIENCE coefficient is +# positive the AGE one is negative and viceversa. +# +# To go further we remove one of the 2 features and check what is the impact +# on the model stability. + +column_to_drop = ['AGE'] + +cv_model = cross_validate( + model, X.drop(columns=column_to_drop), y, + cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.drop(columns=column_to_drop).std(axis=0) + for est in cv_model['estimator']], + columns=feature_names[:-1] +) +plt.figure(figsize=(9, 7)) +sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) +sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) +plt.axvline(x=0, color='.5') +plt.title('Coefficient importance and its variability') +plt.xlabel('Coefficient importance') +plt.subplots_adjust(left=.3) + +############################################################################### +# The estimation of the EXPERIENCE coefficient is now less variable and +# remain important for all models trained during cross-validation. +# +# .. _scaling_num: +# +# Preprocessing numerical variables +# --------------------------------- +# +# As said above (see ":ref:`the-pipeline`"), we could also choose to scale +# numerical values before training the model. +# This can be useful to apply a similar amount regularization to all of them +# in the Ridge. +# The preprocessor is redefined in order to subtract the mean and scale +# variables to unit variance. 
+ +from sklearn.preprocessing import StandardScaler + +preprocessor = make_column_transformer( + (OneHotEncoder(drop='if_binary'), categorical_columns), + (StandardScaler(), numerical_columns), + remainder='passthrough' +) + +############################################################################### +# The model will stay unchanged. + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=Ridge(alpha=1e-10), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +_ = model.fit(X_train, y_train) + +############################################################################## +# Again, we check the performance of the computed +# model using, for example, the median absolute error of the model and the R +# squared coefficient. + +y_pred = model.predict(X_train) +mae = median_absolute_error(y_train, y_pred) +string_score = f'MAE on training set: {mae:.2f} $/hour' +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' +fig, ax = plt.subplots(figsize=(6, 6)) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") + +plt.text(3, 20, string_score) + +plt.title('Ridge model, small regularization, normalized variables') +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +_ = plt.ylim([0, 27]) + +############################################################################## +# For the coefficient analysis, scaling is not needed this time. + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, small regularization, normalized variables') +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################## +# We cross validate the coefficients. 
+ +cv_model = cross_validate( + model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ + for est in cv_model['estimator']], + columns=feature_names +) +plt.figure(figsize=(9, 7)) +sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) +sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) +plt.axvline(x=0, color='.5') +plt.title('Coefficient variability') +plt.subplots_adjust(left=.3) + +############################################################################## +# The result is quite similar to the non-normalized case. +# +# Linear models with regularization +# --------------------------------- +# +# In machine-learning practice, Ridge Regression is more often used with +# non-negligible regularization. +# Above, we limited this regularization to a very little amount. +# Regularization improves the conditioning of the problem and reduces the +# variance of the estimates. RidgeCV applies cross validation in order to +# determine which value of the regularization parameter (`alpha`) is best +# suited for the model estimation. + +from sklearn.linear_model import RidgeCV + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=RidgeCV(alphas=np.logspace(-10, 10, 21)), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +_ = model.fit(X_train, y_train) + +############################################################################## +# First we verify which value of :math:`\alpha` has been selected. + +model[-1].regressor_.alpha_ + +############################################################################## +# Then we check the quality of the predictions. 
+ +y_pred = model.predict(X_train) +mae = median_absolute_error(y_train, y_pred) +string_score = f'MAE on training set: {mae:.2f} $/hour' +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' + +fig, ax = plt.subplots(figsize=(6, 6)) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") + +plt.text(3, 20, string_score) + +plt.title('Ridge model, regularization, normalized variables') +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +_ = plt.ylim([0, 27]) + +############################################################################## +# The ability to reproduce the data of the regularized model is similar to +# the one of the non-regularized model. + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, regularization, normalized variables') +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################## +# The coefficients are significantly different. +# AGE and EXPERIENCE coefficients are both positive but they have less +# influence on the prediction. +# The regularization manages to lower the influence of correlated +# variables on the model because the weight is shared between the two +# predictive variables, so neither alone would be very strongly weighted. +# On the other hand, those weights are more robust with respect to +# cross validation (see the :ref:`ridge_regression` User Guide section), +# as is shown in the plot below to be compared with the +# :ref:`previous one`. 
+ +cv_model = cross_validate( + model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.std(axis=0) + for est in cv_model['estimator']], + columns=feature_names +) + +plt.ylabel('Age coefficient') +plt.xlabel('Experience coefficient') +plt.grid(True) +plt.xlim(-0.4, 0.5) +plt.ylim(-0.4, 0.5) +plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE ' + 'across folds') + +############################################################################## +# Linear models with sparse coefficients +# -------------------------------------- +# +# Another possibility to take into account correlated variables in the dataset, +# is to estimate sparse coefficients. In some way we already did it manually +# when we dropped the AGE column in a previous Ridge estimation. +# +# Lasso models (see the :ref:`lasso` User Guide section) estimates sparse +# coefficients. LassoCV applies cross validation in order to +# determine which value of the regularization parameter (`alpha`) is best +# suited for the model estimation. + +from sklearn.linear_model import LassoCV + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=LassoCV(alphas=np.logspace(-10, 10, 21), max_iter=100000), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +_ = model.fit(X_train, y_train) + +############################################################################## +# First we verify which value of :math:`\alpha` has been selected. + +model[-1].regressor_.alpha_ + +############################################################################## +# Then we check the quality of the predictions. 
+ +y_pred = model.predict(X_train) +mae = median_absolute_error(y_train, y_pred) +string_score = f'MAE on training set: {mae:.2f} $/hour' +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' + +fig, ax = plt.subplots(figsize=(6, 6)) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") + +plt.text(3, 20, string_score) + +plt.title('Lasso model, regularization, normalized variables') +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +_ = plt.ylim([0, 27]) + +############################################################################## +# For our dataset, again the model is not very predictive. + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Lasso model, regularization, normalized variables') +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################# +# A Lasso model identifies the correlation between +# AGE and EXPERIENCE and suppresses one of them for the sake of the prediction. +# +# Lessons learned +# --------------- +# +# * Feature importance could be extrapolated from the coefficients only after +# having scaled them to the same unit of measure. +# * Coefficients in multiple linear models represent conditional dependencies +# between a given feature and the target. +# * Correlated features induce variability in the coefficients of linear +# models. +# * Different linear models respond differently to feature correlation and +# coefficients could significantly vary from one another. 
From c464e92e704be6f4feaae0d81a4fd88c8d5a07e1 Mon Sep 17 00:00:00 2001 From: Alex Henrie Date: Tue, 10 Mar 2020 17:01:51 -0600 Subject: [PATCH 087/103] MNT Remove unused imports (#16665) --- sklearn/calibration.py | 2 +- sklearn/cluster/_bicluster.py | 3 +-- sklearn/cluster/_dbscan.py | 1 - sklearn/cluster/_optics.py | 1 - sklearn/cluster/_spectral.py | 2 +- sklearn/covariance/_graph_lasso.py | 2 +- sklearn/datasets/_base.py | 1 - sklearn/decomposition/_kernel_pca.py | 3 +-- sklearn/decomposition/_lda.py | 3 +-- sklearn/discriminant_analysis.py | 2 +- sklearn/ensemble/_bagging.py | 2 +- sklearn/ensemble/_gb.py | 6 ------ .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 2 +- sklearn/ensemble/_weight_boosting.py | 2 +- sklearn/feature_selection/_rfe.py | 2 +- sklearn/feature_selection/_variance_threshold.py | 1 - sklearn/gaussian_process/_gpc.py | 2 +- sklearn/gaussian_process/_gpr.py | 2 +- sklearn/impute/_base.py | 1 - sklearn/inspection/_plot/partial_dependence.py | 2 +- sklearn/kernel_ridge.py | 1 - sklearn/linear_model/_base.py | 2 +- sklearn/linear_model/_bayes.py | 1 - sklearn/linear_model/_coordinate_descent.py | 2 +- sklearn/linear_model/_huber.py | 1 - sklearn/linear_model/_least_angle.py | 2 +- sklearn/linear_model/_logistic.py | 2 -- sklearn/linear_model/_omp.py | 2 +- sklearn/linear_model/_ridge.py | 1 - sklearn/linear_model/_theil_sen.py | 1 - sklearn/manifold/_t_sne.py | 1 - sklearn/metrics/cluster/_unsupervised.py | 1 - sklearn/mixture/_bayesian_mixture.py | 1 - sklearn/mixture/_gaussian_mixture.py | 1 - sklearn/neighbors/_nca.py | 3 +-- sklearn/neighbors/_nearest_centroid.py | 2 +- sklearn/neural_network/_multilayer_perceptron.py | 2 +- sklearn/preprocessing/_function_transformer.py | 1 - sklearn/semi_supervised/_label_propagation.py | 2 +- sklearn/svm/_base.py | 2 +- sklearn/svm/_classes.py | 2 -- sklearn/utils/deprecation.py | 1 - 42 files changed, 23 insertions(+), 53 deletions(-) diff --git a/sklearn/calibration.py 
b/sklearn/calibration.py index a5490efa28c0a..8a719d49bd6de 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -21,7 +21,7 @@ from .base import (BaseEstimator, ClassifierMixin, RegressorMixin, clone, MetaEstimatorMixin) from .preprocessing import label_binarize, LabelBinarizer -from .utils import check_X_y, check_array, indexable, column_or_1d +from .utils import check_array, indexable, column_or_1d from .utils.validation import check_is_fitted, check_consistent_length from .utils.validation import _check_sample_weight from .isotonic import IsotonicRegression diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index c98272d6aae33..bfc27a84c7b76 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -18,8 +18,7 @@ from ..utils.extmath import (make_nonnegative, randomized_svd, safe_sparse_dot) -from ..utils.validation import (assert_all_finite, check_array, - _deprecate_positional_args) +from ..utils.validation import assert_all_finite, _deprecate_positional_args __all__ = ['SpectralCoclustering', diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 52c962052f9bc..74e0b84477ad6 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -14,7 +14,6 @@ from scipy import sparse from ..base import BaseEstimator, ClusterMixin -from ..utils import check_array from ..utils.validation import _check_sample_weight, _deprecate_positional_args from ..neighbors import NearestNeighbors diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 92322b0ab0bfd..c8ca3ec569a88 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -14,7 +14,6 @@ import warnings import numpy as np -from ..utils import check_array from ..utils import gen_batches, get_chunk_n_rows from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 
2faddabefa157..3eac2b84b74fd 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -11,7 +11,7 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array -from ..utils.validation import check_array, _deprecate_positional_args +from ..utils.validation import _deprecate_positional_args from ..metrics.pairwise import pairwise_kernels from ..neighbors import kneighbors_graph, NearestNeighbors from ..manifold import spectral_embedding diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 77ff9adb7fc0c..39aa77db417b9 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -18,7 +18,7 @@ from . import empirical_covariance, EmpiricalCovariance, log_likelihood from ..exceptions import ConvergenceWarning -from ..utils.validation import check_random_state, check_array +from ..utils.validation import check_random_state from ..utils.validation import _deprecate_positional_args from ..linear_model import _cd_fast as cd_fast from ..linear_model import lars_path_gram diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 909470f980a5e..d481288133991 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -9,7 +9,6 @@ import os import csv import shutil -import warnings from collections import namedtuple from os import environ, listdir, makedirs from os.path import dirname, exists, expanduser, isdir, join, splitext diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 6f15ebc29f761..b80935b4d4359 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -9,8 +9,7 @@ from ..utils import check_random_state from ..utils.extmath import svd_flip -from ..utils.validation import (check_is_fitted, check_array, - _check_psd_eigenvalues) +from ..utils.validation import check_is_fitted, _check_psd_eigenvalues from ..exceptions import 
NotFittedError from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index ba68e03a16191..8ce3f05b7031c 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -17,8 +17,7 @@ from joblib import Parallel, delayed, effective_n_jobs from ..base import BaseEstimator, TransformerMixin -from ..utils import (check_random_state, check_array, - gen_batches, gen_even_slices) +from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.fixes import logsumexp from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 2bd3948f2e013..45e623904b9ea 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -18,7 +18,7 @@ from .linear_model._base import LinearClassifierMixin from .covariance import ledoit_wolf, empirical_covariance, shrunk_covariance from .utils.multiclass import unique_labels -from .utils import check_array, check_X_y +from .utils import check_array from .utils.validation import check_is_fitted from .utils.multiclass import check_classification_targets from .utils.extmath import softmax diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index d73f38954d21a..6a98d79bfac7e 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -16,7 +16,7 @@ from ..base import ClassifierMixin, RegressorMixin from ..metrics import r2_score, accuracy_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_random_state, check_X_y, check_array, column_or_1d +from ..utils import check_random_state, check_array, column_or_1d from ..utils import indices_to_mask from ..utils.metaestimators import if_delegate_has_method from ..utils.multiclass import check_classification_targets diff --git 
a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index c9f0b69f57968..d55499c92bac4 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -40,22 +40,16 @@ from scipy.sparse import csc_matrix from scipy.sparse import csr_matrix from scipy.sparse import issparse -from scipy.special import expit from time import time from ..model_selection import train_test_split from ..tree import DecisionTreeRegressor from ..tree._tree import DTYPE, DOUBLE -from ..tree._tree import TREE_LEAF from . import _gb_losses from ..utils import check_random_state from ..utils import check_array from ..utils import column_or_1d -from ..utils import check_consistent_length -from ..utils import deprecated -from ..utils.fixes import logsumexp -from ..utils.stats import _weighted_percentile from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.multiclass import check_classification_targets from ..exceptions import NotFittedError diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e18d3ac4b1f9b..109c147c047d2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -8,7 +8,7 @@ from timeit import default_timer as time from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier) -from ...utils import check_X_y, check_random_state, check_array, resample +from ...utils import check_random_state, check_array, resample from ...utils.validation import (check_is_fitted, check_consistent_length, _check_sample_weight) from ...utils.multiclass import check_classification_targets diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index de73858f4bb3f..8b1e05502deac 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -33,7 +33,7 @@ from ..base import ClassifierMixin, RegressorMixin, 
is_classifier, is_regressor from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_array, check_random_state, check_X_y, _safe_indexing +from ..utils import check_array, check_random_state, _safe_indexing from ..utils.extmath import softmax from ..utils.extmath import stable_cumsum from ..metrics import accuracy_score, r2_score diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 69e3cc4de9e6c..ac4dbdf58d174 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -9,7 +9,7 @@ import numpy as np from joblib import Parallel, delayed, effective_n_jobs -from ..utils import check_X_y, safe_sqr +from ..utils import safe_sqr from ..utils.metaestimators import if_delegate_has_method from ..utils.metaestimators import _safe_split from ..utils.validation import check_is_fitted diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 6438e6b80dc0a..b0bd41ba41abd 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -4,7 +4,6 @@ import numpy as np from ..base import BaseEstimator from ._base import SelectorMixin -from ..utils import check_array from ..utils.sparsefuncs import mean_variance_axis, min_max_axis from ..utils.validation import check_is_fitted diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index ed8ed2a007a22..e70838c6d251a 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -14,7 +14,7 @@ from ..base import BaseEstimator, ClassifierMixin, clone from .kernels \ import RBF, CompoundKernel, ConstantKernel as C -from ..utils.validation import check_X_y, check_is_fitted, check_array +from ..utils.validation import check_is_fitted, check_array from ..utils import check_random_state from ..utils.optimize import _check_optimize_result from ..preprocessing import 
LabelEncoder diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 1b48efb39f26d..1b7094d048c77 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -15,7 +15,7 @@ from ..base import MultiOutputMixin from .kernels import RBF, ConstantKernel as C from ..utils import check_random_state -from ..utils.validation import check_X_y, check_array +from ..utils.validation import check_array from ..utils.optimize import _check_optimize_result diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index bc98778d5c5d8..608f8f54ee162 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -16,7 +16,6 @@ from ..utils.validation import FLOAT_DTYPES from ..utils._mask import _get_mask from ..utils import is_scalar_nan -from ..utils import check_array def _check_inputs_dtype(X, missing_values): diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 5dee2750ad37a..f39c604cac77b 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -9,7 +9,7 @@ from joblib import Parallel, delayed from .. 
import partial_dependence -from ...base import is_classifier, is_regressor +from ...base import is_regressor from ...utils import check_array from ...utils import check_matplotlib_support # noqa from ...utils import _safe_indexing diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 21c43979c3b1e..d08c706caefc4 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -9,7 +9,6 @@ from .base import BaseEstimator, RegressorMixin, MultiOutputMixin from .metrics.pairwise import pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel -from .utils import check_X_y from .utils.validation import check_is_fitted, _check_sample_weight diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index d280f9d0f0d81..8e91767b9ff53 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -26,7 +26,7 @@ from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin) -from ..utils import check_array, check_X_y +from ..utils import check_array from ..utils.validation import FLOAT_DTYPES from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index c67fc54f43157..397461e73d8be 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -12,7 +12,6 @@ from ._base import LinearModel, _rescale_data from ..base import RegressorMixin from ..utils.extmath import fast_logdet -from ..utils import check_X_y from ..utils.fixes import pinvh from ..utils.validation import _check_sample_weight diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 9281d03710455..ca1e8ccb48ca4 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -17,7 +17,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin from ._base import 
_preprocess_data -from ..utils import check_array, check_X_y +from ..utils import check_array from ..utils.validation import check_random_state from ..model_selection import check_cv from ..utils.extmath import safe_sparse_dot diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 1d3a3fcc73421..d9046d3a1ee9b 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -7,7 +7,6 @@ from ..base import BaseEstimator, RegressorMixin from ._base import LinearModel -from ..utils import check_X_y from ..utils import axis0_safe_slice from ..utils.validation import _check_sample_weight from ..utils.extmath import safe_sparse_dot diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 9f0f62471376a..81068bb40c725 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -19,7 +19,7 @@ from ._base import LinearModel from ..base import RegressorMixin, MultiOutputMixin -from ..utils import arrayfuncs, as_float_array, check_X_y +from ..utils import arrayfuncs, as_float_array from ..model_selection import check_cv from ..exceptions import ConvergenceWarning diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 9e84e56ee0284..5b3adcb002c5d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -29,9 +29,7 @@ from ..utils.extmath import row_norms from ..utils.fixes import logsumexp from ..utils.optimize import _newton_cg, _check_optimize_result -from ..utils.validation import check_X_y from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils import deprecated from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args from ..model_selection import check_cv diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 54b751423c933..0d572dd17c6d7 100644 --- a/sklearn/linear_model/_omp.py +++ 
b/sklearn/linear_model/_omp.py @@ -15,7 +15,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin -from ..utils import as_float_array, check_array, check_X_y +from ..utils import as_float_array, check_array from ..model_selection import check_cv premature = """ Orthogonal matching pursuit ended prematurely due to linear diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index c40f641df4b5e..9c3f703ac478e 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -22,7 +22,6 @@ from ..base import RegressorMixin, MultiOutputMixin, is_classifier from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms -from ..utils import check_X_y from ..utils import check_array from ..utils import check_consistent_length from ..utils import compute_sample_weight diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index a29cc26cdc0a3..16f0adae12c9c 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -20,7 +20,6 @@ from ._base import LinearModel from ..base import RegressorMixin from ..utils import check_random_state -from ..utils import check_X_y from ..exceptions import ConvergenceWarning _EPSILON = np.finfo(np.double).eps diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index d0c9e4e509a73..3fd4d1b364b05 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -16,7 +16,6 @@ from scipy.sparse import csr_matrix, issparse from ..neighbors import NearestNeighbors from ..base import BaseEstimator -from ..utils import check_array from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index a0eaa5e84240a..8841df701c69f 100644 --- 
a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -16,7 +16,6 @@ from ..pairwise import pairwise_distances_chunked from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder -from ...utils import deprecated def check_number_of_labels(n_labels, n_samples): diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index d69b7d1958183..c68fa260faee3 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -15,7 +15,6 @@ from ._gaussian_mixture import _estimate_gaussian_parameters from ._gaussian_mixture import _estimate_log_gaussian_prob from ..utils import check_array -from ..utils.validation import check_is_fitted def _log_dirichlet_norm(dirichlet_concentration): diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 1c563984ba00b..277f65f929eac 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -10,7 +10,6 @@ from ._base import BaseMixture, _check_shape from ..utils import check_array -from ..utils.validation import check_is_fitted from ..utils.extmath import row_norms diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index b9d2de01c958d..cd87d594281da 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -22,8 +22,7 @@ from ..decomposition import PCA from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state -from ..utils.validation import (check_is_fitted, check_array, check_X_y, - check_scalar) +from ..utils.validation import check_is_fitted, check_array, check_scalar from ..exceptions import ConvergenceWarning diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 48712c1fcfb44..0fdcd597353f5 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -15,7 +15,7 @@ from ..base 
import BaseEstimator, ClassifierMixin from ..metrics.pairwise import pairwise_distances from ..preprocessing import LabelEncoder -from ..utils.validation import check_array, check_X_y, check_is_fitted +from ..utils.validation import check_array, check_is_fitted from ..utils.sparsefuncs import csc_median_axis_0 from ..utils.multiclass import check_classification_targets diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 6eb42bb455c3a..3ec30336c23c1 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -22,7 +22,7 @@ from ..utils import gen_batches, check_random_state from ..utils import shuffle from ..utils import _safe_indexing -from ..utils import check_array, check_X_y, column_or_1d +from ..utils import check_array, column_or_1d from ..exceptions import ConvergenceWarning from ..utils.extmath import safe_sparse_dot from ..utils.validation import check_is_fitted diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 85ce3a1f845c1..9cf365ebb3cdf 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -1,7 +1,6 @@ import warnings from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array from ..utils.validation import _allclose_dense_sparse diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index a84a9950aa3ac..d46dacbe754e4 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -67,7 +67,7 @@ from ..neighbors import NearestNeighbors from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import check_classification_targets -from ..utils.validation import check_X_y, check_is_fitted, check_array +from ..utils.validation import check_is_fitted, check_array from 
..exceptions import ConvergenceWarning diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 662a4ffa24678..d935de697fcf7 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -10,7 +10,7 @@ from ..preprocessing import LabelEncoder from ..utils.multiclass import _ovr_decision_function from ..utils import check_array, check_random_state -from ..utils import column_or_1d, check_X_y +from ..utils import column_or_1d from ..utils import compute_class_weight from ..utils.extmath import safe_sparse_dot from ..utils.validation import check_is_fitted, _check_large_sparse diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index fbaa6e97ec616..da5cfa437e476 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1,11 +1,9 @@ -import warnings import numpy as np from ._base import _fit_liblinear, BaseSVC, BaseLibSVM from ..base import BaseEstimator, RegressorMixin, OutlierMixin from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, \ LinearModel -from ..utils import check_X_y from ..utils.validation import _num_samples from ..utils.multiclass import check_classification_targets from ..utils.deprecation import deprecated diff --git a/sklearn/utils/deprecation.py b/sklearn/utils/deprecation.py index 7780cac7b52fb..e71aa57400ac1 100644 --- a/sklearn/utils/deprecation.py +++ b/sklearn/utils/deprecation.py @@ -1,6 +1,5 @@ import warnings import functools -import sys __all__ = ["deprecated"] From 72f39d9ec14ef64c89dceaeeb5335335a6aea5db Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 11 Mar 2020 04:33:02 -0400 Subject: [PATCH 088/103] MNT Restores behavior of conditioning on linting for most instances (#16672) --- azure-pipelines.yml | 4 ++++ build_tools/azure/posix.yml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1aad015849b2e..047988e25f648 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -80,6 +80,7 @@ jobs: name: Linux 
vmImage: ubuntu-18.04 dependsOn: [linting] + condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) matrix: # Linux environment to test that scikit-learn can be built against # versions of numpy, scipy with ATLAS that comes with Ubuntu Bionic 18.04 @@ -123,6 +124,7 @@ jobs: name: Linux32 vmImage: ubuntu-18.04 dependsOn: [linting] + condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) matrix: py36_ubuntu_atlas_32bit: DISTRIB: 'ubuntu-32' @@ -135,6 +137,7 @@ jobs: name: macOS vmImage: macOS-10.14 dependsOn: [linting] + condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) matrix: pylatest_conda_mkl: DISTRIB: 'conda' @@ -168,6 +171,7 @@ jobs: name: Windows vmImage: vs2017-win2016 dependsOn: [linting] + condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) matrix: py37_conda_mkl: PYTHON_VERSION: '3.7' diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index 9efb0418278d2..c48e3644680bd 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -3,7 +3,7 @@ parameters: vmImage: '' matrix: [] dependsOn: [] - condition: ne(variables['Build.Reason'], 'Schedule') + condition: '' jobs: - job: ${{ parameters.name }} From 7cf9e1e62bdfff9b6b6b5b19bef62b2850e1d568 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 11 Mar 2020 04:39:25 -0400 Subject: [PATCH 089/103] BUG Fixes HistGradientBoosting when warm_start is on + early_stopping is on + no validation (#16663) --- .../_hist_gradient_boosting/gradient_boosting.py | 2 ++ .../tests/test_gradient_boosting.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 109c147c047d2..09906f7f4f215 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -304,6 
+304,8 @@ def fit(self, X, y, sample_weight=None): raw_predictions = self._raw_predict(X_binned_train) if self.do_early_stopping_ and self._use_validation_data: raw_predictions_val = self._raw_predict(X_binned_val) + else: + raw_predictions_val = None if self.do_early_stopping_ and self.scoring != 'loss': # Compute the subsample set diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index c5b4a143591d6..13cd8eac1cb7d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -645,3 +645,17 @@ def test_max_depth_max_leaf_nodes(): tree = est._predictors[0][0] assert tree.get_max_depth() == 2 assert tree.get_n_leaf_nodes() == 3 # would be 4 prior to bug fix + + +def test_early_stopping_on_test_set_with_warm_start(): + # Non regression test for #16661 where second fit fails with + # warm_start=True, early_stopping is on, and no validation set + X, y = make_classification(random_state=0) + gb = HistGradientBoostingClassifier( + max_iter=1, scoring='loss', warm_start=True, early_stopping=True, + n_iter_no_change=1, validation_fraction=None) + + gb.fit(X, y) + # does not raise on second call + gb.set_params(max_iter=2) + gb.fit(X, y) From b2b4dbb61539bf7a2d114ad69dcaaa117da37ac5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 11 Mar 2020 09:55:14 +0100 Subject: [PATCH 090/103] BUG fix the math issue in latex compilation (#16673) --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 8fdad51a3ff7f..1f9ea111f9868 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ 
b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -63,7 +63,7 @@ ############################################################################## # Our target for prediction: the wage. -# Wages are described as floating-point number in :math:`k$` +# Wages are described as floating-point number in dollars per hour. y = survey.target.values.ravel() survey.target.head() From ba3fcdbf5b2f20f6cc25e8bd618009f10ac6d37c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 11 Mar 2020 11:40:14 +0100 Subject: [PATCH 091/103] BUG remove $ math env due to latex error (#16674) --- ...linear_model_coefficient_interpretation.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 1f9ea111f9868..9dc8f823ae8bd 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -216,20 +216,18 @@ coefs ############################################################################## -# The AGE coefficient is expressed in -# :math:`$/hours/(living\ years)` while the EDUCATION one is expressed -# in :math:`$/hours/(years\ of\ education)`. -# This representation of the coefficients has the advantage of making clear -# the practical predictions of the model: -# an increase of :math:`1` year in AGE means a decrease of :math:`0.030867$`, -# while an increase of :math:`1` year in EDUCATION means an increase of -# :math:`0.054699$`. -# On the other hand, categorical variables (as UNION or SEX) are adimensional -# numbers taking the value either of 0 or 1. Their coefficients are expressed -# in :math:`$/hours`. Then, we cannot compare the magnitude of different -# coefficients since the features have different natural scales, and hence -# value ranges, because of their different unit of measure. 
-# This is more evident if we plot the coefficients. +# The AGE coefficient is expressed in "dollars/hour per living years" while the +# EDUCATION one is expressed in "dollars/hour per years of education". This +# representation of the coefficients has the advantage of making clear the +# practical predictions of the model: an increase of :math:`1` year in AGE +# means a decrease of :math:`0.030867` dollars/hour, while an increase of +# :math:`1` year in EDUCATION means an increase of :math:`0.054699` +# dollars/hour. On the other hand, categorical variables (as UNION or SEX) are +# adimensional numbers taking either the value 0 or 1. Their coefficients +# are expressed in dollars/hour. Then, we cannot compare the magnitude of +# different coefficients since the features have different natural scales, and +# hence value ranges, because of their different unit of measure. This is more +# evident if we plot the coefficients. coefs.plot(kind='barh', figsize=(9, 7)) plt.title('Ridge model, small regularization') From 864d0282bf74b07f0254d3a43e701e3177f3979b Mon Sep 17 00:00:00 2001 From: Katrina Ni Date: Wed, 11 Mar 2020 12:07:12 -0700 Subject: [PATCH 092/103] DOC add example to tree.ExtraTreeClassifier (#16671) Co-authored-by: Jonathan Rahn Co-authored-by: Katrina Ni --- sklearn/tree/_classes.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 09481aefeed41..fe77610a20601 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1477,6 +1477,21 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", Machine Learning, 63(1), 3-42, 2006. 
+ + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.ensemble import BaggingClassifier + >>> from sklearn.tree import ExtraTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> extra_tree = ExtraTreeClassifier(random_state=0) + >>> cls = BaggingClassifier(extra_tree, random_state=0).fit( + ... X_train, y_train) + >>> cls.score(X_test, y_test) + 0.8947... """ def __init__(self, criterion="gini", From dfdda8368465d44060d5aa831ee5b886615a3516 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 12 Mar 2020 11:38:37 +0100 Subject: [PATCH 093/103] PEP8 in test_encoders.py --- sklearn/preprocessing/tests/test_encoders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 7e23aa2d485c2..aefef7601cd4f 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -284,6 +284,7 @@ def test_one_hot_encoder_drop_reset(drop, reset_drop): assert_allclose(ohe.transform(X), X_tr) assert_array_equal(ohe.get_feature_names(), feature_names) + @pytest.mark.parametrize("method", ['fit', 'fit_transform']) @pytest.mark.parametrize("X", [ [1, 2], From 3686d5550831424a6df1f32c242f9604537708e5 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 12 Mar 2020 06:58:53 -0400 Subject: [PATCH 094/103] MNT Removes unused private attributes (#16675) --- sklearn/isotonic.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 896044ae9cc6e..96e82b7b6a318 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -252,12 +252,10 @@ def _build_y(self, X, y, sample_weight, trim_duplicates=True): unique_X, unique_y, unique_sample_weight = _make_unique( X, y, sample_weight) - # Store _X_ and _y_ 
to maintain backward compat during the deprecation - # period of X_ and y_ - self._X_ = X = unique_X - self._y_ = y = isotonic_regression(unique_y, unique_sample_weight, - self.y_min, self.y_max, - increasing=self.increasing_) + X = unique_X + y = isotonic_regression(unique_y, unique_sample_weight, + self.y_min, self.y_max, + increasing=self.increasing_) # Handle the left and right bounds on X self.X_min_, self.X_max_ = np.min(X), np.max(X) From 77fb39dbe499510fd878c7a61cd2a6caa99f7b55 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 12 Mar 2020 12:56:35 +0100 Subject: [PATCH 095/103] CI Check for unused imports when linting (#16678) * Check for missing imports in linting * Exclude externals * Remove unused imports --- build_tools/circle/linting.sh | 2 ++ sklearn/__init__.py | 1 - sklearn/cluster/tests/test_bicluster.py | 1 - sklearn/cross_decomposition/tests/test_pls.py | 1 - sklearn/datasets/tests/test_base.py | 1 - sklearn/ensemble/tests/test_gradient_boosting.py | 1 - sklearn/linear_model/tests/test_huber.py | 1 - sklearn/linear_model/tests/test_perceptron.py | 1 - sklearn/linear_model/tests/test_ransac.py | 1 - sklearn/metrics/cluster/tests/test_unsupervised.py | 1 - sklearn/tests/test_common.py | 2 +- sklearn/tests/test_discriminant_analysis.py | 4 +--- sklearn/tests/test_multiclass.py | 2 -- sklearn/tree/tests/test_tree.py | 1 - sklearn/utils/__init__.py | 1 + sklearn/utils/tests/test_estimator_checks.py | 2 +- 16 files changed, 6 insertions(+), 17 deletions(-) diff --git a/build_tools/circle/linting.sh b/build_tools/circle/linting.sh index 2b408031c2eb6..dad7ad95ce7c1 100755 --- a/build_tools/circle/linting.sh +++ b/build_tools/circle/linting.sh @@ -141,6 +141,8 @@ else check_files "$(echo "$MODIFIED_FILES" | grep -v ^examples)" check_files "$(echo "$MODIFIED_FILES" | grep ^examples)" \ --config ./examples/.flake8 + # check code for unused imports + flake8 --exclude=sklearn/externals/ --select=F401 sklearn/ examples/ fi echo -e "No problem detected 
by flake8\n" diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 59aa672533524..cb8b37692618f 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -13,7 +13,6 @@ See http://scikit-learn.org for complete documentation. """ import sys -import re import logging import os diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 7d5a920600d7d..6e3e664c622a8 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -9,7 +9,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import SkipTest from sklearn.base import BaseEstimator, BiclusterMixin diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index 2d788a2cf6271..f47dcc8e8f22f 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -1,4 +1,3 @@ -import pytest import numpy as np from numpy.testing import assert_approx_equal diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index a58bdc9ed644d..224538b181696 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -8,7 +8,6 @@ from functools import partial import pytest -import joblib import numpy as np from sklearn.datasets import get_data_home diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 0c7f07929e370..c7653ddac959c 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -13,7 +13,6 @@ from sklearn import datasets from sklearn.base import clone -from sklearn.base import BaseEstimator from sklearn.datasets import (make_classification, fetch_california_housing, make_regression) from sklearn.ensemble 
import GradientBoostingClassifier diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index cb70db88d3d41..156cd4b57dbc8 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ b/sklearn/linear_model/tests/test_huber.py @@ -3,7 +3,6 @@ import numpy as np from scipy import optimize, sparse -import pytest from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal diff --git a/sklearn/linear_model/tests/test_perceptron.py b/sklearn/linear_model/tests/test_perceptron.py index 6cdd538ca9247..1fe74164f17f4 100644 --- a/sklearn/linear_model/tests/test_perceptron.py +++ b/sklearn/linear_model/tests/test_perceptron.py @@ -1,6 +1,5 @@ import numpy as np import scipy.sparse as sp -import pytest from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_raises diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index f52e4f0852d5f..1f7d3c2569bab 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -1,4 +1,3 @@ -import pytest import numpy as np from scipy import sparse diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index f169a9242daf0..354b6c94a7548 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -5,7 +5,6 @@ from sklearn import datasets from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns_message from sklearn.metrics.cluster import silhouette_score from sklearn.metrics.cluster import silhouette_samples from sklearn.metrics import pairwise_distances diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index d769bb630bd03..92cbed36044bf 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -20,7 +20,7 @@ 
from sklearn.utils import all_estimators from sklearn.utils._testing import ignore_warnings from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.estimator_checks import check_estimator, _safe_tags +from sklearn.utils.estimator_checks import check_estimator import sklearn from sklearn.base import BiclusterMixin diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index dcd4009a47a2d..029ba8471ed1f 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -4,10 +4,8 @@ from scipy import linalg -from sklearn.exceptions import ChangedBehaviorWarning from sklearn.utils import check_random_state -from sklearn.utils._testing import (assert_array_equal, assert_no_warnings, - assert_warns_message) +from sklearn.utils._testing import assert_array_equal, assert_no_warnings from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_almost_equal diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 33eb5da939725..03ada399d2af2 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -1,5 +1,3 @@ -import pytest - import numpy as np import scipy.sparse as sp diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1149ceb8678d9..071e7efd49177 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -25,7 +25,6 @@ from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import TempMemmap from sklearn.utils.validation import check_random_state diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4b69365339389..aac6e292a198a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ 
-51,6 +51,7 @@ "check_symmetric", "indices_to_mask", "deprecated", "parallel_backend", "register_parallel_backend", "resample", "shuffle", "check_matplotlib_support", "all_estimators", + "DataConversionWarning" ] IS_PYPY = platform.python_implementation() == 'PyPy' diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 748666884e60e..a7f4911791467 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -35,7 +35,7 @@ from sklearn.svm import SVC from sklearn.neighbors import KNeighborsRegressor from sklearn.tree import DecisionTreeClassifier -from sklearn.utils.validation import check_X_y, check_array +from sklearn.utils.validation import check_array from sklearn.utils import all_estimators From e087ea7f9cd33fb380d6d6792ba9cfe82ecb14df Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Fri, 13 Mar 2020 04:52:17 -0400 Subject: [PATCH 096/103] DOC wording in linear model interpretation example (#16680) Co-authored-by: Guillaume Lemaitre Co-authored-by: Chiara Marmo Co-authored-by: Nicolas Hug --- ...linear_model_coefficient_interpretation.py | 108 +++++++++++------- 1 file changed, 67 insertions(+), 41 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 9dc8f823ae8bd..7583bfa0a052f 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -3,9 +3,9 @@ Common pitfalls in interpretation of coefficients of linear models ================================================================== -Linear models describe situations in which the target value is expected to be +In linear models, the target value is modeled as a linear combination of the features (see the :ref:`linear_model` User Guide -section for a description of a set of linear model methods 
available in +section for a description of a set of linear models available in scikit-learn). Coefficients in multiple linear models represent the relationship between the given feature, :math:`X_i` and the target, :math:`y`, assuming that all the @@ -56,8 +56,9 @@ X.describe(include="all") ############################################################################## -# Notice that the dataset contains categorical and numerical variables. -# This will give us directions on how to preprocess the data thereafter. +# Note that the dataset contains categorical and numerical variables. +# We will need to take this into account when preprocessing the dataset +# thereafter. X.head() @@ -92,21 +93,24 @@ _ = sns.pairplot(train_dataset, kind='reg', diag_kind='kde') ############################################################################## -# Looking closely at the WAGE distribution it could be noticed that it has a -# long tail and we could take its logarithm -# to simplify our problem and approximate a normal distribution. +# Looking closely at the WAGE distribution reveals that it has a +# long tail. For this reason, we should take its logarithm +# to turn it approximately into a normal distribution (linear models such +# as ridge or lasso work best for a normal distribution of error). +# # The WAGE is increasing when EDUCATION is increasing. -# It should be noted that the dependence between WAGE and EDUCATION -# represented here is a marginal dependence, i.e., it describe the behavior -# of a specific variable without fixing the others. -# Also, the EXPERIENCE and AGE are linearly correlated. +# Note that the dependence between WAGE and EDUCATION +# represented here is a marginal dependence, i.e., it describes the behavior +# of a specific variable without keeping the others fixed. +# +# Also, the EXPERIENCE and AGE are strongly linearly correlated. # # .. 
_the-pipeline: # # The machine-learning pipeline # ----------------------------- # -# To design our machine-learning pipeline, we manually +# To design our machine-learning pipeline, we first manually # check the type of data that we are dealing with: survey.data.info() @@ -137,7 +141,7 @@ ) ############################################################################## -# To describe the dataset as a linear model we choose to use a ridge regressor +# To describe the dataset as a linear model we use a ridge regressor # with a very small regularization and to model the logarithm of the WAGE. @@ -190,11 +194,11 @@ # The model learnt is far from being a good model making accurate predictions: # this is obvious when looking at the plot above, where good predictions # should lie on the red line. +# # In the following section, we will interpret the coefficients of the model. -# While we do so, we should keep in mind that any conclusion we way draw will -# be about -# the model that we build, rather than about the true (real-world) generative -# process of the data. +# While we do so, we should keep in mind that any conclusion we draw is +# about the model that we build, rather than about the true (real-world) +# generative process of the data. # # Interpreting coefficients: scale matters # --------------------------------------------- @@ -218,7 +222,7 @@ ############################################################################## # The AGE coefficient is expressed in "dollars/hour per living years" while the # EDUCATION one is expressed in "dollars/hour per years of education". 
This -# representation of the coefficients has the advantage of making clear the +# representation of the coefficients has the benefit of making clear the # practical predictions of the model: an increase of :math:`1` year in AGE # means a decrease of :math:`0.030867` dollars/hour, while an increase of # :math:`1` year in EDUCATION means an increase of :math:`0.054699` @@ -227,7 +231,7 @@ # are expressed in dollars/hour. Then, we cannot compare the magnitude of # different coefficients since the features have different natural scales, and # hence value ranges, because of their different unit of measure. This is more -# evident if we plot the coefficients. +# visible if we plot the coefficients. coefs.plot(kind='barh', figsize=(9, 7)) plt.title('Ridge model, small regularization') @@ -237,12 +241,15 @@ ############################################################################### # Indeed, from the plot above the most important factor in determining WAGE # appears to be the -# variable UNION, even if it is plausible that variables like EXPERIENCE -# should have more impact. -# Looking at the coefficient plot to extrapolate feature importance could be +# variable UNION, even if our intuition might tell us that variables +# like EXPERIENCE should have more impact. +# +# Looking at the coefficient plot to gauge feature importance can be # misleading as some of them vary on a small scale, while others, like AGE, # varies a lot more, several decades. -# This is evident if we compare feature standard deviations. +# +# This is visible if we compare the standard deviations of different +# features. X_train_preprocessed = pd.DataFrame( model.named_steps['columntransformer'].transform(X_train), @@ -296,8 +303,11 @@ # Checking the variability of the coefficients # -------------------------------------------- # -# We can check the coefficient variability through cross-validation. 
-# If coefficients vary in a significant way changing the input dataset +# We can check the coefficient variability through cross-validation: +# it is a form of data perturbation (related to +# `resampling `_). +# +# If coefficients vary significantly when changing the input dataset # their robustness is not guaranteed, and they should probably be interpreted # with caution. @@ -330,6 +340,7 @@ # might be due to the collinearity between the 2 features: as AGE and # EXPERIENCE vary together in the data, their effect is difficult to tease # apart. +# # To verify this interpretation we plot the variability of the AGE and # EXPERIENCE coefficient. # @@ -446,7 +457,7 @@ plt.subplots_adjust(left=.3) ############################################################################## -# We cross validate the coefficients. +# We now inspect the coefficients across several cross-validation folds. cv_model = cross_validate( model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), @@ -472,11 +483,12 @@ # # In machine-learning practice, Ridge Regression is more often used with # non-negligible regularization. +# # Above, we limited this regularization to a very little amount. # Regularization improves the conditioning of the problem and reduces the # variance of the estimates. RidgeCV applies cross validation in order to # determine which value of the regularization parameter (`alpha`) is best -# suited for the model estimation. +# suited for prediction. from sklearn.linear_model import RidgeCV @@ -492,7 +504,7 @@ _ = model.fit(X_train, y_train) ############################################################################## -# First we verify which value of :math:`\alpha` has been selected. +# First we check which value of :math:`\alpha` has been selected. model[-1].regressor_.alpha_ @@ -533,15 +545,18 @@ ############################################################################## # The coefficients are significantly different. 
-# AGE and EXPERIENCE coefficients are both positive but they have less +# AGE and EXPERIENCE coefficients are both positive but they now have less # influence on the prediction. -# The regularization manages to lower the influence of correlated +# +# The regularization reduces the influence of correlated # variables on the model because the weight is shared between the two -# predictive variables, so neither alone would be very strongly weighted. -# On the other hand, those weights are more robust with respect to -# cross validation (see the :ref:`ridge_regression` User Guide section), -# as is shown in the plot below to be compared with the -# :ref:`previous one`. +# predictive variables, so neither alone would have strong weights. +# +# On the other hand, the weights obtained with regularization are more +# stable (see the :ref:`ridge_regression` User Guide section). This +# increased stability is visible from the plot, obtained from data +# perturbations, in a cross validation. This plot can be compared with +# the :ref:`previous one`. cv_model = cross_validate( model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), @@ -632,14 +647,25 @@ # A Lasso model identifies the correlation between # AGE and EXPERIENCE and suppresses one of them for the sake of the prediction. # +# It is important to keep in mind that the coefficients that have been +# dropped may still be related to the outcome by themselves: the model +# chose to suppress them because they bring little or no additional +# information on top of the other features. Additionnaly, this selection +# is unstable for correlated features, and should be interpreted with +# caution. +# # Lessons learned # --------------- # -# * Feature importance could be extrapolated from the coefficients only after -# having scaled them to the same unit of measure. -# * Coefficients in multiple linear models represent conditional dependencies -# between a given feature and the target. 
-# * Correlated features induce variability in the coefficients of linear -# models. +# * Coefficients must be scaled to the same unit of measure to retrieve +# feature importance. Scaling them with the standard-deviation of the +# feature is a useful proxy. +# * Coefficients in multivariate linear models represent the dependency +# between a given feature and the target, **conditional** on the other +# features. +# * Correlated features induce instabilities in the coefficients of linear +# models and their effects cannot be well teased apart. # * Different linear models respond differently to feature correlation and # coefficients could significantly vary from one another. +# * Inspecting coefficients across the folds of a cross-validation loop +# gives an idea of their stability. From 140dae425a0d877340969187e6f39a1ac9e65655 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 13 Mar 2020 09:54:57 +0100 Subject: [PATCH 097/103] API make __init__ params in cross_decomposition kw-only (#16682) --- sklearn/cross_decomposition/_cca.py | 4 +++- sklearn/cross_decomposition/_pls.py | 16 +++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/cross_decomposition/_cca.py b/sklearn/cross_decomposition/_cca.py index bd2e933339228..9d60b59cd07f1 100644 --- a/sklearn/cross_decomposition/_cca.py +++ b/sklearn/cross_decomposition/_cca.py @@ -1,5 +1,6 @@ from ._pls import _PLS from ..base import _UnstableArchMixin +from ..utils.validation import _deprecate_positional_args __all__ = ['CCA'] @@ -102,7 +103,8 @@ class CCA(_UnstableArchMixin, _PLS): PLSSVD """ - def __init__(self, n_components=2, scale=True, + @_deprecate_positional_args + def __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True): super().__init__(n_components=n_components, scale=scale, deflation_mode="canonical", mode="B", diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 88951d18468d8..508448c3ede39 100644 --- 
a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -17,6 +17,7 @@ from ..utils import check_array, check_consistent_length from ..utils.extmath import svd_flip from ..utils.validation import check_is_fitted, FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning __all__ = ['PLSCanonical', 'PLSRegression', 'PLSSVD'] @@ -248,7 +249,8 @@ class _PLS(TransformerMixin, RegressorMixin, MultiOutputMixin, BaseEstimator, """ @abstractmethod - def __init__(self, n_components=2, scale=True, deflation_mode="regression", + def __init__(self, n_components=2, *, scale=True, + deflation_mode="regression", mode="A", algorithm="nipals", norm_y_weights=False, max_iter=500, tol=1e-06, copy=True): self.n_components = n_components @@ -650,8 +652,8 @@ class PLSRegression(_PLS): Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris: Editions Technic. """ - - def __init__(self, n_components=2, scale=True, + @_deprecate_positional_args + def __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True): super().__init__( n_components=n_components, scale=scale, @@ -799,8 +801,8 @@ class PLSCanonical(_PLS): CCA PLSSVD """ - - def __init__(self, n_components=2, scale=True, algorithm="nipals", + @_deprecate_positional_args + def __init__(self, n_components=2, *, scale=True, algorithm="nipals", max_iter=500, tol=1e-06, copy=True): super().__init__( n_components=n_components, scale=scale, @@ -868,8 +870,8 @@ class PLSSVD(TransformerMixin, BaseEstimator): PLSCanonical CCA """ - - def __init__(self, n_components=2, scale=True, copy=True): + @_deprecate_positional_args + def __init__(self, n_components=2, *, scale=True, copy=True): self.n_components = n_components self.scale = scale self.copy = copy From dd437aac940e6d78068e5bbe3ce0d9d9bdda6b94 Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Fri, 13 Mar 2020 15:24:09 +0100 Subject: [PATCH 098/103] DOC Adds example to 
OAS (#16681) --- sklearn/covariance/_shrunk_covariance.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 06e1b4f180347..fcc13a84e803e 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -534,6 +534,27 @@ class OAS(EmpiricalCovariance): coefficient in the convex combination used for the computation of the shrunk estimate. Range is [0, 1]. + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import OAS + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=500) + >>> oas = OAS().fit(X) + >>> oas.covariance_ + array([[0.7533..., 0.2763...], + [0.2763..., 0.3964...]]) + >>> oas.precision_ + array([[ 1.7833..., -1.2431... ], + [-1.2431..., 3.3889...]]) + >>> oas.shrinkage_ + 0.0195... + Notes ----- The regularised covariance is: From 7ec9c618214db1f93d0acf0614ee3d7d5a2989c3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 13 Mar 2020 15:25:09 +0100 Subject: [PATCH 099/103] =?UTF-8?q?DOC=20Add=20note=20on=20bias=20induced?= =?UTF-8?q?=20by=20dropping=20categories=20in=20OneHotE=E2=80=A6=20(#16679?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sklearn/preprocessing/_encoders.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 86be9d335bd9e..ab66847854f33 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -193,6 +193,10 @@ class OneHotEncoder(_BaseEncoder): features cause problems, such as when feeding the resulting data into a neural network or an unregularized regression. 
+ However, dropping one category breaks the symmetry of the original + representation and can therefore induce a bias in downstream models, + for instance for penalized linear classification or regression models. + - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely. From d3e7041a8674a68ef6cd3d7ce5b8d1646686a830 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 13 Mar 2020 15:38:54 +0100 Subject: [PATCH 100/103] API make __init__ params in compose module kw-only (#16542) * API make __init__ params in compose module kw-only * pep8 * move * to after --- sklearn/compose/_column_transformer.py | 9 ++++++--- sklearn/compose/_target.py | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index e94757bca6993..903c63a00fd22 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -23,6 +23,7 @@ from ..utils import _determine_key_type from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted +from ..utils.validation import _deprecate_positional_args __all__ = [ @@ -171,8 +172,9 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): """ _required_parameters = ['transformers'] + @_deprecate_positional_args def __init__(self, - transformers, + transformers, *, remainder='drop', sparse_threshold=0.3, n_jobs=None, @@ -826,8 +828,9 @@ class make_column_selector: [-0.30151134, 0. , 1. , 0. ], [ 0.90453403, 0. , 0. , 1. 
]]) """ - - def __init__(self, pattern=None, dtype_include=None, dtype_exclude=None): + @_deprecate_positional_args + def __init__(self, pattern=None, *, dtype_include=None, + dtype_exclude=None): self.pattern = pattern self.dtype_include = dtype_include self.dtype_exclude = dtype_exclude diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 27f4ef63edf68..d8c062ed423a2 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -10,6 +10,7 @@ from ..utils.validation import check_is_fitted from ..utils import check_array, _safe_indexing from ..preprocessing import FunctionTransformer +from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError __all__ = ['TransformedTargetRegressor'] @@ -106,7 +107,8 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): `. """ - def __init__(self, regressor=None, transformer=None, + @_deprecate_positional_args + def __init__(self, regressor=None, *, transformer=None, func=None, inverse_func=None, check_inverse=True): self.regressor = regressor self.transformer = transformer From 9b1cdc938cd7e5ff6d9f8210dcf45bed3346d0ba Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Thu, 19 Mar 2020 15:48:14 +0530 Subject: [PATCH 101/103] Removed x100 from MAPE and modified tests too --- doc/modules/model_evaluation.rst | 2 +- doc/whats_new/_contributors.rst | 2 -- sklearn/metrics/_regression.py | 15 +++++++++------ sklearn/metrics/tests/test_regression.py | 6 +++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 52c726e909139..ddd0e38eb3795 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1991,7 +1991,7 @@ function:: >>> y_true = [1, 10, 1e6] >>> y_pred = [0.9, 15, 1.2e6] >>> mean_absolute_percentage_error(y_true, y_pred) - 26.66... + 0.2666... 
In above example, if we had used `mean_absolute_error`, it would have ignored the small magnitude values and only reflected the error in prediction of highest diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index a56e75ab176b0..ca0f8ede93afa 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -176,6 +176,4 @@ .. _Nicolas Hug: https://github.com/NicolasHug -.. _Ashutosh Hathidara: https://github.com/ashutosh1919 - .. _Guillaume Lemaitre: https://github.com/glemaitre \ No newline at end of file diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 08f93d8f9a3b5..449828649b774 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -195,7 +195,9 @@ def mean_absolute_percentage_error(y_true, y_pred, multioutput='uniform_average'): """Mean absolute percentage error regression loss - Read more in the :ref:`User Guide `. + Note here that we do not represent the output as a percentage in range + [0, 100]. Instead, we represent it in range [0, 1]. Read more in the + :ref:`User Guide `. Parameters ---------- @@ -221,7 +223,7 @@ def mean_absolute_percentage_error(y_true, y_pred, Returns ------- - loss : float or ndarray of floats + loss : float or ndarray of floats in the range [0, 1] If multioutput is 'raw_values', then mean absolute percentage error is returned for each output separately. If multioutput is 'uniform_average' or an ndarray of weights, then the @@ -230,6 +232,7 @@ def mean_absolute_percentage_error(y_true, y_pred, MAPE output is non-negative floating point. The best value is 0.0. But note the fact that bad predictions can lead to arbitarily large MAPE values, especially if some y_true values are very close to zero. + Note that we return a large value instead of `inf` when y_true is zero. 
Examples -------- @@ -237,13 +240,13 @@ def mean_absolute_percentage_error(y_true, y_pred, >>> y_true = [3, -0.5, 2, 7] >>> y_pred = [2.5, 0.0, 2, 8] >>> mean_absolute_percentage_error(y_true, y_pred) - 32.73... + 0.3273... >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] >>> y_pred = [[0, 2], [-1, 2], [8, -5]] >>> mean_absolute_percentage_error(y_true, y_pred) - 55.15... + 0.5515... >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7]) - 61.98... + 0.6198... """ y_type, y_true, y_pred, multioutput = _check_reg_targets( y_true, y_pred, multioutput) @@ -251,7 +254,7 @@ def mean_absolute_percentage_error(y_true, y_pred, epsilon = np.finfo(np.float64).eps mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon) output_errors = np.average(mape, - weights=sample_weight, axis=0) * 100.0 + weights=sample_weight, axis=0) if isinstance(multioutput, str): if multioutput == 'raw_values': return output_errors diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 0e89f66d0a5ed..30bb92b48927b 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -214,7 +214,7 @@ def test_regression_multioutput_array(): assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) - assert_array_almost_equal(mape, [7.78, 22.62], decimal=2) + assert_array_almost_equal(mape, [0.0778, 0.2262], decimal=2) assert_array_almost_equal(r, [0.95, 0.93], decimal=2) assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) @@ -274,7 +274,7 @@ def test_regression_custom_weights(): assert_almost_equal(msew, 0.39, decimal=2) assert_almost_equal(rmsew, 0.62, decimal=2) assert_almost_equal(maew, 0.475, decimal=3) - assert_almost_equal(mapew, 16.68, decimal=2) + assert_almost_equal(mapew, 0.1668, decimal=2) assert_almost_equal(rw, 0.94, decimal=2) assert_almost_equal(evsw, 0.94, decimal=2) @@ -329,4 +329,4 @@ def 
test_mean_absolute_percentage_error(): random_number_generator = np.random.RandomState(42) y_true = random_number_generator.exponential(size=100) y_pred = 1.2 * y_true - assert mean_absolute_percentage_error(y_true, y_pred) == pytest.approx(20.) + assert mean_absolute_percentage_error(y_true, y_pred) == pytest.approx(0.2) From 297ce25ddc9262e4ae6ffc84c41ba69b2ab564c1 Mon Sep 17 00:00:00 2001 From: Ashutosh Hathidara Date: Wed, 1 Apr 2020 18:48:43 +0530 Subject: [PATCH 102/103] Changed range --- sklearn/metrics/_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 449828649b774..0c5ff60e7e886 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -196,7 +196,7 @@ def mean_absolute_percentage_error(y_true, y_pred, """Mean absolute percentage error regression loss Note here that we do not represent the output as a percentage in range - [0, 100]. Instead, we represent it in range [0, 1]. Read more in the + [0, 100]. Instead, we represent it in range [0, 1/eps]. Read more in the :ref:`User Guide `. Parameters @@ -223,7 +223,7 @@ def mean_absolute_percentage_error(y_true, y_pred, Returns ------- - loss : float or ndarray of floats in the range [0, 1] + loss : float or ndarray of floats in the range [0, 1/eps] If multioutput is 'raw_values', then mean absolute percentage error is returned for each output separately. 
If multioutput is 'uniform_average' or an ndarray of weights, then the From 7b2c86f3d9dd4dc6276c79ed8318180780ee0b3b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 4 Jul 2020 11:54:27 +0200 Subject: [PATCH 103/103] DOC Add absolute_percentage_error to doc/modules/model_evaluation.rst --- doc/modules/model_evaluation.rst | 81 ++++++++++++++++---------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index ba29d443745b1..bb8b59889a3f5 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -54,51 +54,52 @@ the model and the data, like :func:`metrics.mean_squared_error`, are available as neg_mean_squared_error which return the negated value of the metric. -============================== ============================================= ================================== -Scoring Function Comment -============================== ============================================= ================================== +==================================== ============================================== ================================== +Scoring Function Comment +==================================== ============================================== ================================== **Classification** -'accuracy' :func:`metrics.accuracy_score` -'balanced_accuracy' :func:`metrics.balanced_accuracy_score` -'average_precision' :func:`metrics.average_precision_score` -'neg_brier_score' :func:`metrics.brier_score_loss` -'f1' :func:`metrics.f1_score` for binary targets -'f1_micro' :func:`metrics.f1_score` micro-averaged -'f1_macro' :func:`metrics.f1_score` macro-averaged -'f1_weighted' :func:`metrics.f1_score` weighted average -'f1_samples' :func:`metrics.f1_score` by multilabel sample -'neg_log_loss' :func:`metrics.log_loss` requires ``predict_proba`` support -'precision' etc. :func:`metrics.precision_score` suffixes apply as with 'f1' -'recall' etc. 
:func:`metrics.recall_score` suffixes apply as with 'f1' -'jaccard' etc. :func:`metrics.jaccard_score` suffixes apply as with 'f1' -'roc_auc' :func:`metrics.roc_auc_score` -'roc_auc_ovr' :func:`metrics.roc_auc_score` -'roc_auc_ovo' :func:`metrics.roc_auc_score` -'roc_auc_ovr_weighted' :func:`metrics.roc_auc_score` -'roc_auc_ovo_weighted' :func:`metrics.roc_auc_score` +'accuracy' :func:`metrics.accuracy_score` +'balanced_accuracy' :func:`metrics.balanced_accuracy_score` +'average_precision' :func:`metrics.average_precision_score` +'neg_brier_score' :func:`metrics.brier_score_loss` +'f1' :func:`metrics.f1_score` for binary targets +'f1_micro' :func:`metrics.f1_score` micro-averaged +'f1_macro' :func:`metrics.f1_score` macro-averaged +'f1_weighted' :func:`metrics.f1_score` weighted average +'f1_samples' :func:`metrics.f1_score` by multilabel sample +'neg_log_loss' :func:`metrics.log_loss` requires ``predict_proba`` support +'precision' etc. :func:`metrics.precision_score` suffixes apply as with 'f1' +'recall' etc. :func:`metrics.recall_score` suffixes apply as with 'f1' +'jaccard' etc. 
:func:`metrics.jaccard_score` suffixes apply as with 'f1' +'roc_auc' :func:`metrics.roc_auc_score` +'roc_auc_ovr' :func:`metrics.roc_auc_score` +'roc_auc_ovo' :func:`metrics.roc_auc_score` +'roc_auc_ovr_weighted' :func:`metrics.roc_auc_score` +'roc_auc_ovo_weighted' :func:`metrics.roc_auc_score` **Clustering** -'adjusted_mutual_info_score' :func:`metrics.adjusted_mutual_info_score` -'adjusted_rand_score' :func:`metrics.adjusted_rand_score` -'completeness_score' :func:`metrics.completeness_score` -'fowlkes_mallows_score' :func:`metrics.fowlkes_mallows_score` -'homogeneity_score' :func:`metrics.homogeneity_score` -'mutual_info_score' :func:`metrics.mutual_info_score` -'normalized_mutual_info_score' :func:`metrics.normalized_mutual_info_score` -'v_measure_score' :func:`metrics.v_measure_score` +'adjusted_mutual_info_score' :func:`metrics.adjusted_mutual_info_score` +'adjusted_rand_score' :func:`metrics.adjusted_rand_score` +'completeness_score' :func:`metrics.completeness_score` +'fowlkes_mallows_score' :func:`metrics.fowlkes_mallows_score` +'homogeneity_score' :func:`metrics.homogeneity_score` +'mutual_info_score' :func:`metrics.mutual_info_score` +'normalized_mutual_info_score' :func:`metrics.normalized_mutual_info_score` +'v_measure_score' :func:`metrics.v_measure_score` **Regression** -'explained_variance' :func:`metrics.explained_variance_score` -'max_error' :func:`metrics.max_error` -'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` -'neg_mean_squared_error' :func:`metrics.mean_squared_error` -'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` -'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` -'neg_median_absolute_error' :func:`metrics.median_absolute_error` -'r2' :func:`metrics.r2_score` -'neg_mean_poisson_deviance' :func:`metrics.mean_poisson_deviance` -'neg_mean_gamma_deviance' :func:`metrics.mean_gamma_deviance` -============================== ============================================= 
================================== +'explained_variance' :func:`metrics.explained_variance_score` +'max_error' :func:`metrics.max_error` +'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` +'neg_mean_squared_error' :func:`metrics.mean_squared_error` +'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` +'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` +'neg_median_absolute_error' :func:`metrics.median_absolute_error` +'r2' :func:`metrics.r2_score` +'neg_mean_poisson_deviance' :func:`metrics.mean_poisson_deviance` +'neg_mean_gamma_deviance' :func:`metrics.mean_gamma_deviance` +'neg_mean_absolute_percentage_error' :func:`metrics.mean_absolute_percentage_error` +==================================== ============================================== ================================== Usage examples:
  6 March 2020: scikit-learn 0.22.2 is available for download (see the Changelog).