ENH Restructure grid_scores_ into a future-proof, efficient data structure

raghavrv · raghavrv · commit cad5e6cbe51c · 2016-04-27T20:12:04.000+02:00
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
@@ -320,24 +320,46 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
     return score, parameters, n_samples_test
 
 
-def _check_param_grid(param_grid):
-    if hasattr(param_grid, 'items'):
-        param_grid = [param_grid]
+def _check_param_grid_or_dist(param_grid_or_dist):
+    """Validate param_grid/distribution and return the unique parameters"""
+    parameter_names = set()
 
-    for p in param_grid:
+    if hasattr(param_grid_or_dist, 'items'):
+        param_grid_or_dist = [param_grid_or_dist]
+
+    for p in param_grid_or_dist:
         for v in p.values():
             if isinstance(v, np.ndarray) and v.ndim > 1:
                 raise ValueError("Parameter array should be one-dimensional.")
 
-            check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
-            if True not in check:
+            if not isinstance(v, (list, tuple, np.ndarray)):
                 raise ValueError("Parameter values should be a list.")
 
             if len(v) == 0:
                 raise ValueError("Parameter values should be a non-empty "
                                  "list.")
 
+        parameter_names.update(p.keys())
+
+    return list(parameter_names)
+
+
+def _get_metric_name(scoring):
+    """Generate the metric name given the scoring parameter"""
+    if callable(scoring):
+        if scoring.__name__ == "_passthrough_scorer":
+            return "estimator_default_scorer"
+        else:
+            return "custom_metric_%s" % (scoring.__name__,)
+
+    elif isinstance(scoring, six.string_types):
+        return scoring
+
+    else:
+        raise ValueError("Unknown metric type - %r" % type(scoring))
 
+
+# XXX Remove in 0.20
 class _CVScoreTuple (namedtuple('_CVScoreTuple',
                                 ('parameters',
                                  'mean_validation_score',
@@ -526,6 +548,7 @@ def _fit(self, X, y, labels, parameter_iterable):
         estimator = self.estimator
         cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
         self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
+        self.metric_name_ = _get_metric_name(self.scorer_)
 
         n_samples = _num_samples(X)
         X, y, labels = indexable(X, y, labels)
@@ -560,44 +583,90 @@ def _fit(self, X, y, labels, parameter_iterable):
         # Out is a list of triplet: score, estimator, n_test_samples
         n_fits = len(out)
 
-        scores = list()
-        grid_scores = list()
-        for grid_start in range(0, n_fits, n_splits):
-            n_test_samples = 0
-            score = 0
-            all_scores = []
-            for this_score, this_n_test_samples, _, parameters in \
+        self.n_candidates_ = int(n_fits / n_splits)
+        self.n_parameters_ = len(self.parameter_names_)
+
+        res_shape = (self.n_candidates_,)
+
+        search_results = dict()
+
+        for param in self.parameter_names_:
+            # One column to record the values of each parameter
+            search_results[param] = np.ma.masked_all(res_shape, dtype=object)
+
+        # Allocate this buffer once and reuse it for every candidate.
+        all_scores = np.empty((n_splits,), dtype=np.float64)
+
+        # Turn this into a loop once multiple-metric support is introduced.
+        metric = self.metric_name_
+
+        # Make a column for each split of each metric
+        for split_i in range(n_splits):
+            search_results["%s_split_%s" % (metric, split_i)] = (
+                np.empty(res_shape, dtype=np.float64))
+        search_results["%s_mean" % metric] = np.empty(res_shape,
+                                                      dtype=np.float64)
+        search_results["%s_rank" % metric] = np.empty(res_shape, dtype=int)
+
+        for fit_i, grid_start in enumerate(range(0, n_fits, n_splits)):
+            n_test_samples_total = 0
+            mean_score = 0
+
+            split_i = -1
+            for score_i, n_test_samples_i, _, parameters in \
                     out[grid_start:grid_start + n_splits]:
-                all_scores.append(this_score)
+                split_i += 1
+                # Record the raw (unweighted) score for the i-th split
+                # of the current parameter setting candidate.
+                all_scores[split_i] = score_i
+
                 if self.iid:
-                    this_score *= this_n_test_samples
-                    n_test_samples += this_n_test_samples
-                score += this_score
+                    score_i *= n_test_samples_i
+                    n_test_samples_total += n_test_samples_i
+
+                mean_score += score_i
+                search_results["%s_split_%s" %
+                               (metric, split_i)][fit_i] = score_i
+
             if self.iid:
-                score /= float(n_test_samples)
+                mean_score = all_scores.sum() / float(n_test_samples_total)
             else:
-                score /= float(n_splits)
-            scores.append((score, parameters))
-            # TODO: shall we also store the test_fold_sizes?
-            grid_scores.append(_CVScoreTuple(
-                parameters,
-                score,
-                np.array(all_scores)))
-        # Store the computed scores
-        self.grid_scores_ = grid_scores
+                mean_score = all_scores.mean()
+
+            # Store the mean score and the parameters for this fit
+            search_results["%s_mean" % metric][fit_i] = mean_score
+            for param in parameters:
+                # This entry alone gets unmasked when assigned
+                search_results[param][fit_i] = parameters[param]
 
         # Find the best parameters by comparing on the mean validation score:
         # note that `sorted` is deterministic in the way it breaks ties
-        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
-                      reverse=True)[0]
-        self.best_params_ = best.parameters
-        self.best_score_ = best.mean_validation_score
+        # We reverse the order to get a descending sort order
+        sorted_indices = np.argsort(
+            search_results["%s_mean" % metric])[::-1]
+
+        search_results["%s_rank" % metric][sorted_indices] = (
+            np.arange(1, self.n_parameters_ + 2))
+
+        self.search_results_ = search_results
+
+        best = sorted_indices[0]
+
+        parameters = dict()
+
+        for param in self.parameter_names_:
+            value = search_results[param][best]
+            if value is not np.ma.masked:
+                parameters[param] = search_results[param][best]
+
+        self.best_params_ = parameters
+        self.best_score_ = search_results["%s_mean" % metric][best]
 
         if self.refit:
             # fit the best estimator using the entire dataset
             # clone first to work around broken estimators
             best_estimator = clone(base_estimator).set_params(
-                **best.parameters)
+                **parameters)
             if y is not None:
                 best_estimator.fit(X, y, **self.fit_params)
             else:
@@ -722,15 +791,32 @@ class GridSearchCV(BaseSearchCV):
 
     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-            * ``parameters``, a dict of parameter settings
-            * ``mean_validation_score``, the mean score over the
-              cross-validation folds
-            * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can be
+        imported into a pandas DataFrame.
+
+        For instance, the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |           0.8           |         0.81          |
+        'poly'|  -  |  3   |           0.7           |         0.60          |
+        'rbf' | 0.1 |  -   |           0.8           |         0.75          |
+        'rbf' | 0.2 |  -   |           0.9           |         0.82          |
+
+        will be represented by the following search_results_ dict:
+
+        {'kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                 mask = [False False False False]...),
+         'gamma' : masked_array(data = [-- -- 0.1 0.2],
+                                mask = [ True  True False False]...),
+         'degree' : masked_array(data = [2.0 3.0 -- --],
+                                 mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean' : [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank' : [2, 4, 3, 1],
+        }
 
     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
@@ -784,7 +870,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score)
         self.param_grid = param_grid
-        _check_param_grid(param_grid)
+        self.parameter_names_ = _check_param_grid_or_dist(param_grid)
 
     def fit(self, X, y=None, labels=None):
         """Run fit with all sets of parameters.
@@ -918,15 +1004,32 @@ class RandomizedSearchCV(BaseSearchCV):
 
     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-            * ``parameters``, a dict of parameter settings
-            * ``mean_validation_score``, the mean score over the
-              cross-validation folds
-            * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can be
+        imported into a pandas DataFrame.
+
+        For instance, the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |           0.8           |         0.81          |
+        'poly'|  -  |  3   |           0.7           |         0.60          |
+        'rbf' | 0.1 |  -   |           0.8           |         0.75          |
+        'rbf' | 0.2 |  -   |           0.9           |         0.82          |
+
+        will be represented by the following search_results_ dict:
+
+        {'kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                 mask = [False False False False]...),
+         'gamma' : masked_array(data = [-- -- 0.1 0.2],
+                                mask = [ True  True False False]...),
+         'degree' : masked_array(data = [2.0 3.0 -- --],
+                                 mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean' : [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank' : [2, 4, 3, 1],
+        }
 
     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
@@ -969,6 +1072,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
                  error_score='raise'):
 
         self.param_distributions = param_distributions
+        self.parameter_names_ = _check_param_grid_or_dist(param_distributions)
         self.n_iter = n_iter
         self.random_state = random_state
         super(RandomizedSearchCV, self).__init__(