@@ -320,24 +320,46 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
320320 return score , parameters , n_samples_test
321321
322322
323- def _check_param_grid ( param_grid ):
324- if hasattr ( param_grid , 'items' ):
325- param_grid = [ param_grid ]
323+ def _check_param_grid_or_dist ( param_grid_or_dist ):
324+ """Validate param_grid/distribution and return the unique parameters"""
325+ parameter_names = set ()
326326
327- for p in param_grid :
327+ if hasattr (param_grid_or_dist , 'items' ):
328+ param_grid_or_dist = [param_grid_or_dist ]
329+
330+ for p in param_grid_or_dist :
328331 for v in p .values ():
329332 if isinstance (v , np .ndarray ) and v .ndim > 1 :
330333 raise ValueError ("Parameter array should be one-dimensional." )
331334
332- check = [isinstance (v , k ) for k in (list , tuple , np .ndarray )]
333- if True not in check :
335+ if not isinstance (v , (list , tuple , np .ndarray )):
334336 raise ValueError ("Parameter values should be a list." )
335337
336338 if len (v ) == 0 :
337339 raise ValueError ("Parameter values should be a non-empty "
338340 "list." )
339341
342+ parameter_names .update (p .keys ())
343+
344+ return list (parameter_names )
345+
346+
347+ def _get_metric_name (scoring ):
348+ """Generate the metric name given the scoring parameter"""
349+ if callable (scoring ):
350+ if scoring .__name__ == "_passthrough_scorer" :
351+ return "estimator_default_scorer"
352+ else :
353+ return "custom_metric_%s" % (scoring .__name__ ,)
354+
355+ elif isinstance (scoring , six .string_types ):
356+ return scoring
357+
358+ else :
359+ raise ValueError ("Unknown metric type - %r" % type (scoring ))
340360
361+
362+ # XXX Remove in 0.20
341363class _CVScoreTuple (namedtuple ('_CVScoreTuple' ,
342364 ('parameters' ,
343365 'mean_validation_score' ,
@@ -526,6 +548,7 @@ def _fit(self, X, y, labels, parameter_iterable):
526548 estimator = self .estimator
527549 cv = check_cv (self .cv , y , classifier = is_classifier (estimator ))
528550 self .scorer_ = check_scoring (self .estimator , scoring = self .scoring )
551+ self .metric_name_ = _get_metric_name (self .scorer_ )
529552
530553 n_samples = _num_samples (X )
531554 X , y , labels = indexable (X , y , labels )
@@ -560,44 +583,90 @@ def _fit(self, X, y, labels, parameter_iterable):
560583 # Out is a list of triplet: score, estimator, n_test_samples
561584 n_fits = len (out )
562585
563- scores = list ()
564- grid_scores = list ()
565- for grid_start in range (0 , n_fits , n_splits ):
566- n_test_samples = 0
567- score = 0
568- all_scores = []
569- for this_score , this_n_test_samples , _ , parameters in \
586+ self .n_candidates_ = int (n_fits / n_splits )
587+ self .n_parameters_ = len (self .parameter_names_ )
588+
589+ res_shape = (self .n_candidates_ ,)
590+
591+ search_results = dict ()
592+
593+ for param in self .parameter_names_ :
594+ # One column to record the values of each parameter
595+ search_results [param ] = np .ma .masked_all (res_shape , dtype = object )
596+
597+ # Allocate this buffer once and reuse it for every candidate.
598+ all_scores = np .empty ((n_splits ,), dtype = np .float64 )
599+
600+ # Loop this when multiple metric support is introduced.
601+ metric = self .metric_name_
602+
603+ # Make a column for each split of each metric
604+ for split_i in range (n_splits ):
605+ search_results ["%s_split_%s" % (metric , split_i )] = (
606+ np .empty (res_shape , dtype = np .float64 ))
607+ search_results ["%s_mean" % metric ] = np .empty (res_shape ,
608+ dtype = np .float64 )
609+ search_results ["%s_rank" % metric ] = np .empty (res_shape , dtype = int )
610+
611+ for fit_i , grid_start in enumerate (range (0 , n_fits , n_splits )):
612+ n_test_samples_total = 0
613+ mean_score = 0
614+
615+ split_i = - 1
616+ for score_i , n_test_samples_i , _ , parameters in \
570617 out [grid_start :grid_start + n_splits ]:
571- all_scores .append (this_score )
618+ split_i += 1
619+ # Record the score/n_test_samples for the i-th split
620+ # of the current parameter setting candidate.
621+ all_scores [split_i ] = score_i
622+
572623 if self .iid :
573- this_score *= this_n_test_samples
574- n_test_samples += this_n_test_samples
575- score += this_score
624+ score_i *= n_test_samples_i
625+ n_test_samples_total += n_test_samples_i
626+
627+ mean_score += score_i
628+ search_results ["%s_split_%s" %
629+ (metric , split_i )][fit_i ] = score_i
630+
576631 if self .iid :
577- score /= float (n_test_samples )
632+ mean_score = all_scores . sum () / float (n_test_samples_total )
578633 else :
579- score /= float (n_splits )
580- scores .append ((score , parameters ))
581- # TODO: shall we also store the test_fold_sizes?
582- grid_scores .append (_CVScoreTuple (
583- parameters ,
584- score ,
585- np .array (all_scores )))
586- # Store the computed scores
587- self .grid_scores_ = grid_scores
634+ mean_score = all_scores .mean ()
635+
636+ # Store the mean score and the parameters for this fit
637+ search_results ["%s_mean" % metric ][fit_i ] = mean_score
638+ for param in parameters :
639+ # This entry alone gets unmasked when assigned
640+ search_results [param ][fit_i ] = parameters [param ]
588641
589642 # Find the best parameters by comparing on the mean validation score:
590643 # note that `sorted` is deterministic in the way it breaks ties
591- best = sorted (grid_scores , key = lambda x : x .mean_validation_score ,
592- reverse = True )[0 ]
593- self .best_params_ = best .parameters
594- self .best_score_ = best .mean_validation_score
644+ # We reverse the order to get a descending sort order
645+ sorted_indices = np .argsort (
646+ search_results ["%s_mean" % metric ])[::- 1 ]
647+
648+ search_results ["%s_rank" % metric ][sorted_indices ] = (
649+ np .arange (1 , self .n_parameters_ + 2 ))
650+
651+ self .search_results_ = search_results
652+
653+ best = sorted_indices [0 ]
654+
655+ parameters = dict ()
656+
657+ for param in self .parameter_names_ :
658+ value = search_results [param ][best ]
659+ if value is not np .ma .masked :
660+ parameters [param ] = search_results [param ][best ]
661+
662+ self .best_params_ = parameters
663+ self .best_score_ = search_results ["%s_mean" % metric ][best ]
595664
596665 if self .refit :
597666 # fit the best estimator using the entire dataset
598667 # clone first to work around broken estimators
599668 best_estimator = clone (base_estimator ).set_params (
600- ** best . parameters )
669+ ** parameters )
601670 if y is not None :
602671 best_estimator .fit (X , y , ** self .fit_params )
603672 else :
@@ -722,15 +791,32 @@ class GridSearchCV(BaseSearchCV):
722791
723792 Attributes
724793 ----------
725- grid_scores_ : list of named tuples
726- Contains scores for all parameter combinations in param_grid.
727- Each entry corresponds to one parameter setting.
728- Each named tuple has the attributes:
729-
730- * ``parameters``, a dict of parameter settings
731- * ``mean_validation_score``, the mean score over the
732- cross-validation folds
733- * ``cv_validation_scores``, the list of scores for each fold
794+ search_results_ : dict of numpy (masked) ndarrays
795+ A dict with keys as column headers and values as columns, that can be
796+ imported into a pandas DataFrame.
797+
798+ For instance the below given table
799+
800+ kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
801+ =====================================================================
802+ 'poly'| - | 2 | 0.8 | 0.81 |
803+ 'poly'| - | 3 | 0.7 | 0.60 |
804+ 'rbf' | 0.1 | - | 0.8 | 0.75 |
805+ 'rbf' | 0.2 | - | 0.9 | 0.82 |
806+
807+ will be represented by a search_results_ dict of :
808+
809+ {'kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
810+ mask = [False False False False]...),
811+ 'gamma' : masked_array(data = [-- -- 0.1 0.2],
812+ mask = [ True True False False]...),
813+ 'degree' : masked_array(data = [2.0 3.0 -- --],
814+ mask = [False False True True]...),
815+ 'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
816+ 'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
817+ 'accuracy_score_mean' : [0.81, 0.60, 0.75, 0.82],
818+ 'accuracy_score_rank' : [2, 4, 3, 1],
819+ }
734820
735821 best_estimator_ : estimator
736822 Estimator that was chosen by the search, i.e. estimator
@@ -784,7 +870,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
784870 n_jobs = n_jobs , iid = iid , refit = refit , cv = cv , verbose = verbose ,
785871 pre_dispatch = pre_dispatch , error_score = error_score )
786872 self .param_grid = param_grid
787- _check_param_grid (param_grid )
873+ self . parameter_names_ = _check_param_grid_or_dist (param_grid )
788874
789875 def fit (self , X , y = None , labels = None ):
790876 """Run fit with all sets of parameters.
@@ -918,15 +1004,32 @@ class RandomizedSearchCV(BaseSearchCV):
9181004
9191005 Attributes
9201006 ----------
921- grid_scores_ : list of named tuples
922- Contains scores for all parameter combinations in param_grid.
923- Each entry corresponds to one parameter setting.
924- Each named tuple has the attributes:
925-
926- * ``parameters``, a dict of parameter settings
927- * ``mean_validation_score``, the mean score over the
928- cross-validation folds
929- * ``cv_validation_scores``, the list of scores for each fold
1007+ search_results_ : dict of numpy (masked) ndarrays
1008+ A dict with keys as column headers and values as columns, that can be
1009+ imported into a pandas DataFrame.
1010+
1011+ For instance the below given table
1012+
1013+ kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
1014+ =====================================================================
1015+ 'poly'| - | 2 | 0.8 | 0.81 |
1016+ 'poly'| - | 3 | 0.7 | 0.60 |
1017+ 'rbf' | 0.1 | - | 0.8 | 0.75 |
1018+ 'rbf' | 0.2 | - | 0.9 | 0.82 |
1019+
1020+ will be represented by a search_results_ dict of :
1021+
1022+ {'kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
1024+ mask = [False False False False]...),
1024+ 'gamma' : masked_array(data = [-- -- 0.1 0.2],
1025+ mask = [ True True False False]...),
1026+ 'degree' : masked_array(data = [2.0 3.0 -- --],
1027+ mask = [False False True True]...),
1028+ 'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
1029+ 'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
1030+ 'accuracy_score_mean' : [0.81, 0.60, 0.75, 0.82],
1031+ 'accuracy_score_rank' : [2, 4, 3, 1],
1032+ }
9301033
9311034 best_estimator_ : estimator
9321035 Estimator that was chosen by the search, i.e. estimator
@@ -969,6 +1072,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
9691072 error_score = 'raise' ):
9701073
9711074 self .param_distributions = param_distributions
1075+ self .parameter_names_ = _check_param_grid_or_dist (param_distributions )
9721076 self .n_iter = n_iter
9731077 self .random_state = random_state
9741078 super (RandomizedSearchCV , self ).__init__ (
0 commit comments