@@ -320,24 +320,46 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
     return score, parameters, n_samples_test


-def _check_param_grid(param_grid):
-    if hasattr(param_grid, 'items'):
-        param_grid = [param_grid]
+def _check_param_grid_or_dist(param_grid_or_dist):
+    """Validate the param_grid/distribution and return the unique parameter names"""
+    parameter_names = set()

-    for p in param_grid:
+    if hasattr(param_grid_or_dist, 'items'):
+        param_grid_or_dist = [param_grid_or_dist]
+
+    for p in param_grid_or_dist:
         for v in p.values():
             if isinstance(v, np.ndarray) and v.ndim > 1:
                 raise ValueError("Parameter array should be one-dimensional.")

-            check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
-            if True not in check:
+            if not isinstance(v, (list, tuple, np.ndarray)):
                 raise ValueError("Parameter values should be a list.")

             if len(v) == 0:
                 raise ValueError("Parameter values should be a non-empty "
                                  "list.")

+        parameter_names.update(p.keys())
+
+    return list(parameter_names)
+
+
+def _get_metric_name(scoring):
+    """Generate the metric name given the scoring parameter"""
+    if callable(scoring):
+        if scoring.__name__ == "_passthrough_scorer":
+            return "estimator_default_scorer"
+        else:
+            return "custom_metric_%s" % (scoring.__name__,)
+
+    elif isinstance(scoring, six.string_types):
+        return scoring
+
+    else:
+        raise ValueError("Unknown metric type - %r" % type(scoring))

+
+# XXX Remove in 0.20
 class _CVScoreTuple(namedtuple('_CVScoreTuple',
                                ('parameters',
                                 'mean_validation_score',
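A minimal sketch of how the two new helpers behave, assuming this module's existing imports (numpy as np, the vendored six); the grid below is illustrative, not taken from the diff:

    param_grid = [{'kernel': ['poly'], 'degree': [2, 3]},
                  {'kernel': ['rbf'], 'gamma': [0.1, 0.2]}]

    # Union of parameter names across all sub-grids; built from a set,
    # so the ordering of the returned list is not guaranteed.
    _check_param_grid_or_dist(param_grid)  # e.g. ['kernel', 'degree', 'gamma']

    # String scorers pass through; callables get a name derived from __name__.
    _get_metric_name('accuracy')           # -> 'accuracy'

    def my_scorer(estimator, X, y):
        return 0.0

    _get_metric_name(my_scorer)            # -> 'custom_metric_my_scorer'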
@@ -526,6 +548,7 @@ def _fit(self, X, y, labels, parameter_iterable):
         estimator = self.estimator
         cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
         self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
+        self.metric_name_ = _get_metric_name(self.scorer_)

         n_samples = _num_samples(X)
         X, y, labels = indexable(X, y, labels)
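For context on the `_passthrough_scorer` special case handled by `_get_metric_name`: when `scoring` is None, `check_scoring` falls back to the estimator's own `score` method via a function named `_passthrough_scorer`, which is why the generated name becomes `estimator_default_scorer`. A sketch, assuming the internal import path used on this branch:

    from sklearn.metrics.scorer import check_scoring
    from sklearn.svm import SVC

    scorer = check_scoring(SVC(), scoring=None)
    scorer.__name__                # '_passthrough_scorer'
    _get_metric_name(scorer)       # 'estimator_default_scorer'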
@@ -560,44 +583,91 @@ def _fit(self, X, y, labels, parameter_iterable):
         # Out is a list of triplet: score, estimator, n_test_samples
         n_fits = len(out)

-        scores = list()
-        grid_scores = list()
-        for grid_start in range(0, n_fits, n_splits):
-            n_test_samples = 0
-            score = 0
-            all_scores = []
-            for this_score, this_n_test_samples, _, parameters in \
+        self.n_candidates_ = int(n_fits / n_splits)
+        self.n_parameters_ = len(self.parameter_names_)
+
+        res_shape = (self.n_candidates_,)
+
+        search_results = dict()
+
+        # Allocate this once and reuse it across candidates.
+        all_scores = np.empty((n_splits,), dtype=np.float64)
+
+        # Loop over this when multiple-metric support is introduced.
+        metric = self.metric_name_
+
+        for param in self.parameter_names_:
+            # One column to record the values of each parameter
+            search_results["param_%s" % param] = (
+                np.ma.masked_all(res_shape, dtype=object))
+
+        # Make a column for each split of each metric
+        for split_i in range(n_splits):
+            search_results["%s_split_%s" % (metric, split_i)] = (
+                np.empty(res_shape, dtype=np.float64))
+        search_results["%s_mean" % metric] = np.empty(res_shape,
+                                                      dtype=np.float64)
+        search_results["%s_rank" % metric] = np.empty(res_shape, dtype=int)
+
+        for fit_i, grid_start in enumerate(range(0, n_fits, n_splits)):
+            n_test_samples_total = 0
+            mean_score = 0
+
+            split_i = -1
+            for score_i, n_test_samples_i, _, parameters in \
                     out[grid_start:grid_start + n_splits]:
-                all_scores.append(this_score)
+                split_i += 1
+                # Record the raw score for the i-th split of the
+                # current parameter setting candidate.
+                all_scores[split_i] = score_i
+                search_results["%s_split_%s" %
+                               (metric, split_i)][fit_i] = score_i
+
                 if self.iid:
-                    this_score *= this_n_test_samples
-                    n_test_samples += this_n_test_samples
-                score += this_score
+                    score_i *= n_test_samples_i
+                    n_test_samples_total += n_test_samples_i
+
+                mean_score += score_i
+
             if self.iid:
-                score /= float(n_test_samples)
+                mean_score /= float(n_test_samples_total)
             else:
-                score /= float(n_splits)
-            scores.append((score, parameters))
-            # TODO: shall we also store the test_fold_sizes?
-            grid_scores.append(_CVScoreTuple(
-                parameters,
-                score,
-                np.array(all_scores)))
-        # Store the computed scores
-        self.grid_scores_ = grid_scores
+                mean_score = all_scores.mean()
+
+            # Store the mean score and the parameters for this fit
+            search_results["%s_mean" % metric][fit_i] = mean_score
+            for param in parameters:
+                # Assigning a value unmasks this one entry
+                search_results["param_%s" % param][fit_i] = parameters[param]

         # Find the best parameters by comparing on the mean validation score:
         # note that `sorted` is deterministic in the way it breaks ties
-        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
-                      reverse=True)[0]
-        self.best_params_ = best.parameters
-        self.best_score_ = best.mean_validation_score
+        # We reverse argsort's ascending order to get a descending sort
+        sorted_indices = np.argsort(
+            search_results["%s_mean" % metric])[::-1]
+
+        search_results["%s_rank" % metric][sorted_indices] = (
+            np.arange(1, self.n_candidates_ + 1))
+
+        self.search_results_ = search_results
+
+        best = sorted_indices[0]
+
+        parameters = dict()
+
+        for param in self.parameter_names_:
+            value = search_results["param_%s" % param][best]
+            if value is not np.ma.masked:
+                parameters[param] = value
+
+        self.best_params_ = parameters
+        self.best_score_ = search_results["%s_mean" % metric][best]

         if self.refit:
             # fit the best estimator using the entire dataset
             # clone first to work around broken estimators
             best_estimator = clone(base_estimator).set_params(
-                **best.parameters)
+                **parameters)
             if y is not None:
                 best_estimator.fit(X, y, **self.fit_params)
             else:
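A toy check of the ranking logic above, using the mean scores from the docstring example further down; `np.argsort` sorts ascending, the `[::-1]` reversal puts the best candidate first, and the ranks are written back through fancy indexing:

    import numpy as np

    means = np.array([0.81, 0.60, 0.75, 0.82])
    sorted_indices = np.argsort(means)[::-1]   # array([3, 0, 2, 1])
    ranks = np.empty(means.shape, dtype=int)
    ranks[sorted_indices] = np.arange(1, len(means) + 1)
    ranks                                      # array([2, 4, 3, 1])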
@@ -722,15 +792,32 @@ class GridSearchCV(BaseSearchCV):

     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, which can
+        be imported into a pandas ``DataFrame``.
+
+        For instance, the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |          0.8            |        0.81           |
+        'poly'|  -  |  3   |          0.7            |        0.60           |
+        'rbf' | 0.1 |  -   |          0.8            |        0.75           |
+        'rbf' | 0.2 |  -   |          0.9            |        0.82           |
+
+        will be represented by a ``search_results_`` dict of:
+
+        {'param_kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                       mask = [False False False False]...),
+         'param_gamma'  : masked_array(data = [-- -- 0.1 0.2],
+                                       mask = [ True  True False False]...),
+         'param_degree' : masked_array(data = [2.0 3.0 -- --],
+                                       mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank'    : [2, 4, 3, 1],
+        }

     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
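A hedged usage sketch of the new attribute (pandas is assumed to be installed; it is not a scikit-learn dependency, and the import path reflects the model_selection module this diff lives in):

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    X, y = make_classification(random_state=0)
    search = GridSearchCV(SVC(), {'C': [1, 10], 'kernel': ['linear', 'rbf']},
                          scoring='accuracy')
    search.fit(X, y)

    # One row per candidate; with scoring='accuracy' the columns come out as
    # param_C, param_kernel, accuracy_split_0, ..., accuracy_mean,
    # accuracy_rank (the metric prefix follows _get_metric_name).
    df = pd.DataFrame(search.search_results_)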
@@ -784,7 +871,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score)
         self.param_grid = param_grid
-        _check_param_grid(param_grid)
+        self.parameter_names_ = _check_param_grid_or_dist(param_grid)

     def fit(self, X, y=None, labels=None):
         """Run fit with all sets of parameters.
@@ -918,15 +1005,32 @@ class RandomizedSearchCV(BaseSearchCV):

     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, which can
+        be imported into a pandas ``DataFrame``.
+
+        For instance, the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |          0.8            |        0.81           |
+        'poly'|  -  |  3   |          0.7            |        0.60           |
+        'rbf' | 0.1 |  -   |          0.8            |        0.75           |
+        'rbf' | 0.2 |  -   |          0.9            |        0.82           |
+
+        will be represented by a ``search_results_`` dict of:
+
+        {'param_kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                       mask = [False False False False]...),
+         'param_gamma'  : masked_array(data = [-- -- 0.1 0.2],
+                                       mask = [ True  True False False]...),
+         'param_degree' : masked_array(data = [2.0 3.0 -- --],
+                                       mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank'    : [2, 4, 3, 1],
+        }

     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
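For a search over several sub-grids (as in the docstring table), parameters that do not apply to a candidate stay masked in their `param_*` column; a short sketch of reading them back, mirroring what `_fit` does when it rebuilds `best_params_`:

    import numpy as np

    # Assuming `search` was fitted with the two-sub-grid param_grid from the
    # docstring example above:
    degree_col = search.search_results_['param_degree']
    for i, value in enumerate(degree_col):
        if value is not np.ma.masked:
            print("candidate %d used degree=%r" % (i, value))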
@@ -969,6 +1073,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
                  error_score='raise'):

         self.param_distributions = param_distributions
+        self.parameter_names_ = _check_param_grid_or_dist(param_distributions)
         self.n_iter = n_iter
         self.random_state = random_state
         super(RandomizedSearchCV, self).__init__(