@@ -321,24 +321,57 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
     return score, parameters, n_samples_test


-def _check_param_grid(param_grid):
-    if hasattr(param_grid, 'items'):
-        param_grid = [param_grid]
+def _check_param_grid_or_dist(param_grid_or_dist):
+    """Validate the param_grid/distribution and return the unique parameter names"""
+    parameter_names = set()

-    for p in param_grid:
+    if hasattr(param_grid_or_dist, 'items'):
+        param_grid_or_dist = [param_grid_or_dist]
+
+    for p in param_grid_or_dist:
         for v in p.values():
             if isinstance(v, np.ndarray) and v.ndim > 1:
                 raise ValueError("Parameter array should be one-dimensional.")

-            check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
-            if True not in check:
+            if not isinstance(v, (list, tuple, np.ndarray)):
                 raise ValueError("Parameter values should be a list.")

             if len(v) == 0:
                 raise ValueError("Parameter values should be a non-empty "
                                  "list.")

+        parameter_names.update(p.keys())
+
+    return list(parameter_names)
+
+
+def _get_metric_names(scoring):
+    """Generate the list of metric name(s) given the scoring parameter"""
+    metric_names = list()
+    # XXX Do we index from 0?
+    # NOTE we need this to prevent collisions between similarly named
+    # custom metrics (e.g. [foo.bar, bar])
+    n_custom_metrics = 1

+    if not isinstance(scoring, (list, tuple)):
+        scoring = [scoring]
+
+    for metric in scoring:
+        if callable(metric):
+            metric_names.append("custom_metric_%s_%s" %
+                                (n_custom_metrics, metric.__name__))
+            n_custom_metrics += 1
+
+        elif isinstance(metric, six.string_types):
+            metric_names.append(metric)
+
+        else:
+            raise ValueError("Unknown metric type - %r" % type(metric))
+
+    return metric_names
+
+
+# XXX Remove in 0.20
 class _CVScoreTuple(namedtuple('_CVScoreTuple',
                                ('parameters',
                                 'mean_validation_score',
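
A quick sketch of how the two new helpers behave. The inputs below are hypothetical (not from the test suite); the expected outputs follow from the code above, including the `custom_metric_%s_%s` naming scheme:

    # Hypothetical inputs; expected outputs inferred from the helpers above.
    grid = [{'kernel': ['rbf'], 'gamma': [0.1, 0.2]},
            {'kernel': ['poly'], 'degree': [2, 3]}]
    print(sorted(_check_param_grid_or_dist(grid)))
    # ['degree', 'gamma', 'kernel']

    def my_scorer(estimator, X, y):
        return 0.0  # stand-in custom metric

    print(_get_metric_names(['accuracy', my_scorer]))
    # ['accuracy', 'custom_metric_1_my_scorer']
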
@@ -381,6 +414,7 @@ def __init__(self, estimator, scoring=None,
         self.verbose = verbose
         self.pre_dispatch = pre_dispatch
         self.error_score = error_score
+        self.metric_names_ = _get_metric_names(scoring)

     @property
     def _estimator_type(self):
@@ -521,6 +555,12 @@ def inverse_transform(self, Xt):
521555 """
522556 return self .best_estimator_ .transform (Xt )
523557
558+ @property
559+ @deprecated ("The grid_scores_ attribute is deprecated in favor of the "
560+ "search_results_ and will be removed in version 0.20." )
561+ def grid_scores_ (self ):
562+ return self ._grid_scores
563+
524564 def _fit (self , X , y , labels , parameter_iterable ):
525565 """Actual fitting, performing the search over parameters."""
526566
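
For context, `deprecated` here is the helper from `sklearn.utils`, so reading `grid_scores_` emits a `DeprecationWarning` before delegating to the private attribute. A minimal standalone sketch of the same pattern (standard library only, not sklearn's actual implementation):

    import warnings

    class _Sketch(object):
        _grid_scores = []  # stands in for the populated attribute

        @property
        def grid_scores_(self):
            warnings.warn("grid_scores_ is deprecated in favor of "
                          "search_results_ and will be removed in 0.20.",
                          DeprecationWarning)
            return self._grid_scores
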
@@ -561,38 +601,67 @@ def _fit(self, X, y, labels, parameter_iterable):
         # Out is a list of triplets: score, estimator, n_test_samples
         n_fits = len(out)

-        scores = list()
-        grid_scores = list()
-        for grid_start in range(0, n_fits, n_splits):
-            n_test_samples = 0
-            score = 0
-            all_scores = []
-            for this_score, this_n_test_samples, _, parameters in \
-                    out[grid_start:grid_start + n_splits]:
-                all_scores.append(this_score)
-                if self.iid:
-                    this_score *= this_n_test_samples
-                    n_test_samples += this_n_test_samples
-                score += this_score
-            if self.iid:
-                score /= float(n_test_samples)
-            else:
-                score /= float(n_splits)
-            scores.append((score, parameters))
-            # TODO: shall we also store the test_fold_sizes?
-            grid_scores.append(_CVScoreTuple(
-                parameters,
-                score,
-                np.array(all_scores)))
-        # Store the computed scores
-        self.grid_scores_ = grid_scores
-
-        # Find the best parameters by comparing on the mean validation score:
-        # note that `sorted` is deterministic in the way it breaks ties
-        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
-                      reverse=True)[0]
-        self.best_params_ = best.parameters
-        self.best_score_ = best.mean_validation_score
+        self._grid_scores = list()
+
+        # XXX Do we want to store these?
+        n_candidates = n_fits // n_splits
+        n_parameters = len(self.parameter_names_)
+        n_metrics = len(self.metric_names_)
+
+        search_results_ = dict()
+
+        for param in self.parameter_names_:
+            search_results_[param] = np.empty((n_candidates,), dtype=object)
+
+        for metric in self.metric_names_:
+            # Make a column for each split
+            # XXX To make it future proof
+            for split_i in range(n_splits):
+                search_results_["%s_split_%s" % (metric, split_i)] = (
+                    np.empty((n_candidates,), dtype=np.float32))
+
+            search_results_["%s_aggregated" % metric] = (
+                np.empty((n_candidates,), dtype=np.float32))
+            search_results_["%s_rank" % metric] = np.empty((n_candidates,),
+                                                           dtype=int)
+
+        # XXX Loop over the metrics when multiple metric support is enabled
+        metric = self.metric_names_[0]
+
+        for grid_start in range(0, n_fits, n_splits):
+            candidate_i = grid_start // n_splits
+            n_test_samples = 0
+            aggregated_score = 0
+            all_scores = []
+
+            for i, (this_score, this_n_test_samples, _, parameters) in \
+                    enumerate(out[grid_start:grid_start + n_splits]):
+                all_scores.append(this_score)
+                key = "%s_split_%s" % (metric, i)
+                search_results_[key][candidate_i] = this_score
+
+                if self.iid:
+                    this_score *= this_n_test_samples
+                    n_test_samples += this_n_test_samples
+                aggregated_score += this_score
+
+            if self.iid:
+                aggregated_score /= float(n_test_samples)
+            else:
+                aggregated_score /= float(n_splits)
+
+            search_results_["%s_aggregated" % metric][candidate_i] = \
+                aggregated_score
+
+            for param, value in parameters.items():
+                search_results_[param][candidate_i] = value
+
+            # XXX Remove in version 0.20
+            self._grid_scores.append(_CVScoreTuple(
+                parameters,
+                aggregated_score,
+                np.array(all_scores)))
+
+        # Rank the candidates by aggregated score; rank 1 is the best.
+        order = np.argsort(-search_results_["%s_aggregated" % metric])
+        search_results_["%s_rank" % metric][order] = np.arange(
+            1, n_candidates + 1)
+        self.search_results_ = search_results_
+
+        # Find the best parameters by comparing on the mean validation score:
+        # note that `sorted` is deterministic in the way it breaks ties
+        best = sorted(self._grid_scores,
+                      key=lambda x: x.mean_validation_score,
+                      reverse=True)[0]
+        self.best_params_ = best.parameters
+        self.best_score_ = best.mean_validation_score

         if self.refit:
             # fit the best estimator using the entire dataset
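
The ranking step can be sanity-checked in isolation. A sketch, assuming the aggregated scores from the docstring example further down, where rank 1 marks the best candidate:

    import numpy as np

    agg = np.array([0.81, 0.60, 0.75, 0.82])  # one aggregated score per candidate
    order = np.argsort(-agg)                  # candidate indices, best first
    ranks = np.empty_like(order)
    ranks[order] = np.arange(1, len(agg) + 1)
    print(ranks)                              # [2 4 3 1]
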
@@ -723,15 +792,32 @@ class GridSearchCV(BaseSearchCV):

     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns that can
+        be imported into a pandas DataFrame.
+
+        For instance the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_aggregated...
+        ==========================================================================
+        'poly'|  -  |  2   |          0.8            |          0.81
+        'poly'|  -  |  3   |          0.7            |          0.60
+        'rbf' | 0.1 |  -   |          0.8            |          0.75
+        'rbf' | 0.2 |  -   |          0.9            |          0.82
+
+        will be represented by a search_results_ dict of:
+
+        {'kernel': masked_array(data=['poly', 'poly', 'rbf', 'rbf'],
+                                mask=[False False False False]...),
+         'gamma': masked_array(data=[-- -- 0.1 0.2],
+                               mask=[ True  True False False]...),
+         'degree': masked_array(data=[2.0 3.0 -- --],
+                                mask=[False False  True  True]...),
+         'accuracy_score_split_0': [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1': [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_aggregated': [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank': [2, 4, 3, 1]}

     best_estimator_ : estimator
     Estimator that was chosen by the search, i.e. estimator
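
Every value in the dict is a one-dimensional column of length `n_candidates`, so it loads straight into pandas. A usage sketch, assuming a fitted search object named `search` on this branch (pandas is optional and not required by scikit-learn itself):

    import pandas as pd

    df = pd.DataFrame(search.search_results_)
    print(df.sort_values('accuracy_score_rank'))  # best candidate first
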
@@ -785,7 +871,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score)
         self.param_grid = param_grid
-        _check_param_grid(param_grid)
+        self.parameter_names_ = _check_param_grid_or_dist(param_grid)

     def fit(self, X, y=None, labels=None):
         """Run fit with all sets of parameters.
@@ -919,15 +1005,32 @@ class RandomizedSearchCV(BaseSearchCV):

     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns that can
+        be imported into a pandas DataFrame.
+
+        For instance the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_aggregated...
+        ==========================================================================
+        'poly'|  -  |  2   |          0.8            |          0.81
+        'poly'|  -  |  3   |          0.7            |          0.60
+        'rbf' | 0.1 |  -   |          0.8            |          0.75
+        'rbf' | 0.2 |  -   |          0.9            |          0.82
+
+        will be represented by a search_results_ dict of:
+
+        {'kernel': masked_array(data=['poly', 'poly', 'rbf', 'rbf'],
+                                mask=[False False False False]...),
+         'gamma': masked_array(data=[-- -- 0.1 0.2],
+                               mask=[ True  True False False]...),
+         'degree': masked_array(data=[2.0 3.0 -- --],
+                                mask=[False False  True  True]...),
+         'accuracy_score_split_0': [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1': [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_aggregated': [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank': [2, 4, 3, 1]}

     best_estimator_ : estimator
     Estimator that was chosen by the search, i.e. estimator
@@ -970,6 +1073,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
                  error_score='raise'):

         self.param_distributions = param_distributions
+        self.parameter_names_ = _check_param_grid_or_dist(param_distributions)
         self.n_iter = n_iter
         self.random_state = random_state
         super(RandomizedSearchCV, self).__init__(
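
A quick sketch of the attribute this constructor change exposes; the import path assumes this model_selection branch and the grid is hypothetical:

    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.svm import SVC

    search = RandomizedSearchCV(SVC(), {'C': [1, 10, 100]}, n_iter=2)
    print(search.parameter_names_)  # ['C']
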