
Commit f1f6a3c

Refactor sample_weights as generic scorer_params

1 parent 7612a1f · commit f1f6a3c

7 files changed, +110 −101 lines changed

sklearn/cross_validation.py (+32 −35)

```diff
@@ -1078,7 +1078,7 @@ def __len__(self):
 
 def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
                     verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
-                    sample_weight=None):
+                    scorer_params=None):
     """Evaluate a score by cross-validation
 
     Parameters
@@ -1093,9 +1093,6 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
         The target variable to try to predict in the case of
         supervised learning.
 
-    sample_weight : array-like, optional, default: None
-        Sample weights.
-
     scoring : string, callable or None, optional, default: None
         A string (see model evaluation documentation) or
         a scorer callable object / function with signature
@@ -1134,12 +1131,16 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
             - A string, giving an expression as a function of n_jobs,
               as in '2*n_jobs'
 
+    scorer_params : dict, optional
+        Parameters to pass to the scorer. Can be used for sample weights
+        and sample groups.
+
     Returns
     -------
     scores : array of float, shape=(len(list(cv)),)
         Array of scores of the estimator for each run of the cross validation.
     """
-    X, y, sample_weight = indexable(X, y, sample_weight)
+    X, y = indexable(X, y)
 
     cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
     scorer = check_scoring(estimator, scoring=scoring)
@@ -1148,16 +1149,14 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
     parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                         pre_dispatch=pre_dispatch)
     scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y,
-                                              sample_weight, scorer,
-                                              train, test, verbose, None,
-                                              fit_params)
+                                              scorer, train, test, verbose,
+                                              None, fit_params, scorer_params)
                       for train, test in cv)
     return np.array(scores)[:, 0]
 
 
-def _fit_and_score(estimator, X, y, sample_weight,
-                   scorer, train, test, verbose, parameters,
-                   fit_params, return_train_score=False,
+def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
+                   fit_params, scorer_params, return_train_score=False,
                    return_parameters=False):
     """Fit estimator and compute scores for a given dataset split.
 
@@ -1173,9 +1172,6 @@ def _fit_and_score(estimator, X, y, sample_weight,
         The target variable to try to predict in the case of
         supervised learning.
 
-    sample_weight : array-like or None
-        Sample weights.
-
     scoring : callable
         A scorer callable object / function with signature
         ``scorer(estimator, X, y)``.
@@ -1195,6 +1191,9 @@ def _fit_and_score(estimator, X, y, sample_weight,
     fit_params : dict or None
         Parameters that will be passed to ``estimator.fit``.
 
+    scorer_params : dict or None
+        Parameters that will be passed to the scorer.
+
     return_train_score : boolean, optional, default: False
         Compute and return score on training set.
 
@@ -1233,33 +1232,36 @@ def _fit_and_score(estimator, X, y, sample_weight,
                        if hasattr(v, '__len__') and len(v) == n_samples else v)
                       for k, v in fit_params.items()])
 
+    # Same, but take both slices
+    scorer_params = scorer_params if scorer_params is not None else {}
+    train_scorer_params = dict([(k, np.asarray(v)[train]
+                                 if hasattr(v, '__len__')
+                                 and len(v) == n_samples
+                                 else v)
+                                for k, v in scorer_params.items()])
+    test_scorer_params = dict([(k, np.asarray(v)[test]
+                                if hasattr(v, '__len__')
+                                and len(v) == n_samples
+                                else v)
+                               for k, v in scorer_params.items()])
+
     if parameters is not None:
         estimator.set_params(**parameters)
 
     start_time = time.time()
 
-    X_train, y_train, sample_weight_train = _safe_split(
-        estimator, X, y, sample_weight, train)
-    X_test, y_test, sample_weight_test = _safe_split(
-        estimator, X, y, sample_weight, test, train)
-
-    test_score_params = {}
-    train_score_params = {}
-    if sample_weight is not None:
-        fit_params = fit_params.copy()
-        fit_params['sample_weight'] = sample_weight_train
-        test_score_params['sample_weight'] = sample_weight_test
-        train_score_params['sample_weight'] = sample_weight_train
+    X_train, y_train = _safe_split(estimator, X, y, train)
+    X_test, y_test = _safe_split(estimator, X, y, test, train)
 
     if y_train is None:
         estimator.fit(X_train, **fit_params)
     else:
         estimator.fit(X_train, y_train, **fit_params)
     test_score = _score(estimator, X_test, y_test, scorer,
-                        **test_score_params)
+                        **test_scorer_params)
     if return_train_score:
         train_score = _score(estimator, X_train, y_train, scorer,
-                             **train_score_params)
+                             **train_scorer_params)
 
     scoring_time = time.time() - start_time
 
@@ -1276,7 +1278,7 @@ def _fit_and_score(estimator, X, y, sample_weight,
     return ret
 
 
-def _safe_split(estimator, X, y, sample_weight, indices, train_indices=None):
+def _safe_split(estimator, X, y, indices, train_indices=None):
     """Create subset of dataset and properly handle kernels."""
     if hasattr(estimator, 'kernel') and callable(estimator.kernel):
         # cannot compute the kernel values with custom function
@@ -1305,12 +1307,7 @@ def _safe_split(estimator, X, y, sample_weight, indices, train_indices=None):
     else:
         y_subset = None
 
-    if sample_weight is not None:
-        sample_weight_subset = np.asarray(sample_weight)[indices]
-    else:
-        sample_weight_subset = None
-
-    return X_subset, y_subset, sample_weight_subset
+    return X_subset, y_subset
 
 
 def _score(estimator, X_test, y_test, scorer, **params):
```
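The caller-facing change: per-sample scoring arguments now travel in a single `scorer_params` dict, and `_fit_and_score` slices any value whose length equals `n_samples` down to each fold's indices. A minimal usage sketch against this commit's API (the estimator, data, and weights are illustrative, and it assumes the scorer in use accepts a `sample_weight` keyword):

```python
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
w = np.random.RandomState(0).rand(100)  # illustrative per-sample weights

# fit_params entries are sliced to each fold's training indices,
# scorer_params entries to the test (or train) indices, so one
# full-length array can serve both fitting and scoring.
scores = cross_val_score(SVC(), X, y, scoring='accuracy',
                         fit_params={'sample_weight': w},
                         scorer_params={'sample_weight': w})
print(scores.mean())
```

Note that, unlike the removed `sample_weight` argument, nothing is copied into `fit_params` automatically anymore: fitting weights and scoring weights are passed independently.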

sklearn/feature_selection/rfe.py (+7 −6)

```diff
@@ -335,16 +335,17 @@ def fit(self, X, y, sample_weight=None):
 
         # Cross-validation
         for n, (train, test) in enumerate(cv):
-            X_train, y_train, sample_weight_train = _safe_split(
-                self.estimator, X, y, sample_weight, train)
-            X_test, y_test, sample_weight_test = _safe_split(
-                self.estimator, X, y, sample_weight, test, train)
+            X_train, y_train = _safe_split(
+                self.estimator, X, y, train)
+            X_test, y_test = _safe_split(
+                self.estimator, X, y, test, train)
 
             fit_params = dict()
             score_params = dict()
             if sample_weight is not None:
-                fit_params['sample_weight'] = sample_weight_train
-                score_params['sample_weight'] = sample_weight_test
+                sample_weight = np.asarray(sample_weight)
+                fit_params['sample_weight'] = sample_weight[train]
+                score_params['sample_weight'] = sample_weight[test]
 
             # Compute a full ranking of the features
             ranking_ = rfe.fit(X_train, y_train, **fit_params).ranking_
```
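The slicing idiom shared by this file and `_fit_and_score` is easy to miss inside the diff: values that have a `__len__` and match `n_samples` are treated as per-sample arrays and indexed by the fold, while everything else passes through unchanged. A standalone sketch of that behavior — `_slice_params` is a hypothetical helper for illustration, not part of this commit:

```python
import numpy as np

def _slice_params(params, indices, n_samples):
    # Mirror of the dict comprehensions in _fit_and_score: per-sample
    # values (length == n_samples) are indexed by the fold indices,
    # everything else (scalars, short sequences) is kept as-is.
    return dict((k, np.asarray(v)[indices]
                 if hasattr(v, '__len__') and len(v) == n_samples
                 else v)
                for k, v in params.items())

params = {'sample_weight': np.arange(6.0), 'some_scalar': 0.5}
train = np.array([0, 2, 4])
print(_slice_params(params, train, n_samples=6))
# {'sample_weight': array([0., 2., 4.]), 'some_scalar': 0.5}
```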

sklearn/grid_search.py (+20 −32)

```diff
@@ -281,7 +281,8 @@ class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
     @abstractmethod
     def __init__(self, estimator, scoring=None,
                  fit_params=None, n_jobs=1, iid=True,
-                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'):
+                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
+                 scorer_params=None):
 
         self.scoring = scoring
         self.estimator = estimator
@@ -292,8 +293,9 @@ def __init__(self, estimator, scoring=None,
         self.cv = cv
         self.verbose = verbose
         self.pre_dispatch = pre_dispatch
+        self.scorer_params = scorer_params
 
-    def score(self, X, y=None, sample_weight=None):
+    def score(self, X, y=None, **scorer_params):
         """Returns the score on the given test data and labels, if the search
         estimator has been refit. The ``score`` function of the best estimator
         is used, or the ``scoring`` parameter where unavailable.
@@ -308,24 +310,18 @@ def score(self, X, y=None, sample_weight=None):
             Target relative to X for classification or regression;
             None for unsupervised learning.
 
-        sample_weight : array-like, shape = [n_samples], optional
-            Sample weights.
-
         Returns
         -------
         score : float
 
         """
-        kwargs = {}
-        if sample_weight is not None:
-            kwargs['sample_weight'] = sample_weight
         if hasattr(self.best_estimator_, 'score'):
-            return self.best_estimator_.score(X, y, **kwargs)
+            return self.best_estimator_.score(X, y, **scorer_params)
         if self.scorer_ is None:
             raise ValueError("No score function explicitly defined, "
                              "and the estimator doesn't provide one %s"
                              % self.best_estimator_)
-        return self.scorer_(self.best_estimator_, X, y, **kwargs)
+        return self.scorer_(self.best_estimator_, X, y, **scorer_params)
 
     @property
     def predict(self):
@@ -343,15 +339,15 @@ def decision_function(self):
     def transform(self):
         return self.best_estimator_.transform
 
-    def _fit(self, X, y, sample_weight, parameter_iterable):
+    def _fit(self, X, y, parameter_iterable):
         """Actual fitting, performing the search over parameters."""
 
         estimator = self.estimator
         cv = self.cv
         self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
 
         n_samples = _num_samples(X)
-        X, y, sample_weight = indexable(X, y, sample_weight)
+        X, y = indexable(X, y)
 
         if y is not None:
             if len(y) != n_samples:
@@ -376,10 +372,10 @@ def _fit(self, X, y, sample_weight, parameter_iterable):
                 n_jobs=self.n_jobs, verbose=self.verbose,
                 pre_dispatch=pre_dispatch
         )(
-            delayed(_fit_and_score)(clone(base_estimator), X, y, sample_weight,
+            delayed(_fit_and_score)(clone(base_estimator), X, y,
                                     self.scorer_, train, test,
                                     self.verbose, parameters, self.fit_params,
-                                    return_parameters=True)
+                                    self.scorer_params, return_parameters=True)
                 for parameters in parameter_iterable
                 for train, test in cv)
 
@@ -422,9 +418,6 @@ def _fit(self, X, y, sample_weight, parameter_iterable):
 
         if self.refit:
             fit_params = self.fit_params
-            if sample_weight is not None:
-                fit_params = fit_params.copy()
-                fit_params['sample_weight'] = sample_weight
             # fit the best estimator using the entire dataset
             # clone first to work around broken estimators
             best_estimator = clone(base_estimator).set_params(
@@ -580,14 +573,15 @@ class GridSearchCV(BaseSearchCV):
 
     def __init__(self, estimator, param_grid, scoring=None,
                  fit_params=None, n_jobs=1, iid=True,
-                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'):
+                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
+                 scorer_params=None):
         super(GridSearchCV, self).__init__(
             estimator, scoring, fit_params, n_jobs, iid,
-            refit, cv, verbose, pre_dispatch)
+            refit, cv, verbose, pre_dispatch, scorer_params)
         self.param_grid = param_grid
         _check_param_grid(param_grid)
 
-    def fit(self, X, y=None, sample_weight=None):
+    def fit(self, X, y=None):
         """Run fit with all sets of parameters.
 
         Parameters
@@ -600,11 +594,8 @@ def fit(self, X, y=None, sample_weight=None):
         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
             Target relative to X for classification or regression;
             None for unsupervised learning.
-
-        sample_weight : array-like, shape = [n_samples], optional
-            Sample weights.
         """
-        return self._fit(X, y, sample_weight, ParameterGrid(self.param_grid))
+        return self._fit(X, y, ParameterGrid(self.param_grid))
 
 
 class RandomizedSearchCV(BaseSearchCV):
@@ -730,17 +721,18 @@ class RandomizedSearchCV(BaseSearchCV):
 
     def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
                  fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
-                 verbose=0, pre_dispatch='2*n_jobs', random_state=None):
+                 verbose=0, pre_dispatch='2*n_jobs', random_state=None,
+                 scorer_params=None):
 
         self.param_distributions = param_distributions
         self.n_iter = n_iter
         self.random_state = random_state
         super(RandomizedSearchCV, self).__init__(
             estimator=estimator, scoring=scoring, fit_params=fit_params,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
-            pre_dispatch=pre_dispatch)
+            pre_dispatch=pre_dispatch, scorer_params=scorer_params)
 
-    def fit(self, X, y=None, sample_weight=None):
+    def fit(self, X, y=None):
         """Run fit on the estimator with randomly drawn parameters.
 
         Parameters
@@ -752,12 +744,8 @@ def fit(self, X, y=None, sample_weight=None):
         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
             Target relative to X for classification or regression;
             None for unsupervised learning.
-
-        sample_weight : array-like, shape = [n_samples], optional
-            Sample weights.
-
         """
         sampled_params = ParameterSampler(self.param_distributions,
                                           self.n_iter,
                                           random_state=self.random_state)
-        return self._fit(X, y, sample_weight, sampled_params)
+        return self._fit(X, y, sampled_params)
```
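On the search side, `scorer_params` becomes a constructor argument, stored as `self.scorer_params` and forwarded to every `_fit_and_score` call, while `score()` on a refit search now takes scorer parameters as keyword arguments instead. A hedged usage sketch under the same assumption as above (a scorer that accepts `sample_weight`; estimator and grid are illustrative):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
w = np.random.RandomState(0).rand(100)

# The same full-length weights are sliced per fold for both fitting
# (via fit_params) and scoring (via scorer_params); on refit=True the
# unsliced fit_params are used to refit on the whole dataset.
search = GridSearchCV(SVC(), param_grid={'C': [0.1, 1.0, 10.0]},
                      scoring='accuracy',
                      fit_params={'sample_weight': w},
                      scorer_params={'sample_weight': w})
search.fit(X, y)
print(search.best_params_)
```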
