From 40fc72271834cacff7955ce4949f2d6512f80dc5 Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Thu, 5 Jul 2018 19:05:22 -0300 Subject: [PATCH 1/3] Refactor pipeline namespace to make it more reusable --- sklearn/pipeline.py | 119 ++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 48 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 1e99dd54615ae..8853612c6eeda 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -26,6 +26,33 @@ __all__ = ['Pipeline', 'FeatureUnion', 'make_pipeline', 'make_union'] +# weight and fit_params are not used but it allows _fit_one_transformer, +# _transform_one and _fit_transform_one to have the same signature to +# factorize the code in ColumnTransformer +def _fit_one_transformer(transformer, X, y, weight=None, **fit_params): + return transformer.fit(X, y) + + +def _apply_weight(Xt, weight=None): + # if we have a weight for this transformer, multiply output + if weight is None: + return Xt + return Xt * weight + + +def _transform_one(transformer, X, y, weight, **fit_params): + res = transformer.transform(X) + return _apply_weight(res, weight) + + +def _fit_transform_one(transformer, X, y, weight, **fit_params): + if hasattr(transformer, 'fit_transform'): + res = transformer.fit_transform(X, y, **fit_params) + else: + res = transformer.fit(X, y, **fit_params).transform(X) + return _apply_weight(res, weight), transformer + + class Pipeline(_BaseComposition): """Pipeline of transforms with a final estimator. @@ -147,6 +174,11 @@ def set_params(self, **kwargs): self._set_params('steps', **kwargs) return self + @property + def _fit_transform_one(self): + # property needed because `memory.cache` doesn't accept class methods + return _fit_transform_one + def _validate_steps(self): names, estimators = zip(*self.steps) @@ -194,7 +226,7 @@ def _fit(self, X, y=None, **fit_params): # Setup the memory memory = check_memory(self.memory) - fit_transform_one_cached = memory.cache(_fit_transform_one) + fit_transform_one_cached = memory.cache(self._fit_transform_one) fit_params_steps = dict((name, {}) for name, step in self.steps if step is not None) @@ -582,32 +614,6 @@ def make_pipeline(*steps, **kwargs): return Pipeline(_name_estimators(steps), memory=memory) -# weight and fit_params are not used but it allows _fit_one_transformer, -# _transform_one and _fit_transform_one to have the same signature to -# factorize the code in ColumnTransformer -def _fit_one_transformer(transformer, X, y, weight=None, **fit_params): - return transformer.fit(X, y) - - -def _transform_one(transformer, X, y, weight, **fit_params): - res = transformer.transform(X) - # if we have a weight for this transformer, multiply output - if weight is None: - return res - return res * weight - - -def _fit_transform_one(transformer, X, y, weight, **fit_params): - if hasattr(transformer, 'fit_transform'): - res = transformer.fit_transform(X, y, **fit_params) - else: - res = transformer.fit(X, y, **fit_params).transform(X) - # if we have a weight for this transformer, multiply output - if weight is None: - return res, transformer - return res * weight, transformer - - class FeatureUnion(_BaseComposition, TransformerMixin): """Concatenates results of multiple transformer objects. @@ -685,6 +691,15 @@ def set_params(self, **kwargs): self._set_params('transformer_list', **kwargs) return self + def _validate_one_transformer(self, t): + if t is None: + return + if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not + hasattr(t, "transform")): + raise TypeError("All estimators should implement fit and " + "transform. '%s' (type %s) doesn't" % + (t, type(t))) + def _validate_transformers(self): names, transformers = zip(*self.transformer_list) @@ -693,13 +708,7 @@ def _validate_transformers(self): # validate estimators for t in transformers: - if t is None: - continue - if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not - hasattr(t, "transform")): - raise TypeError("All estimators should implement fit and " - "transform. '%s' (type %s) doesn't" % - (t, type(t))) + self._validate_one_transformer(t) def _iter(self): """ @@ -710,6 +719,18 @@ def _iter(self): for name, trans in self.transformer_list if trans is not None) + @property + def _fit_one_transformer(self): + return _fit_one_transformer + + @property + def _fit_transform_one(self): + return _fit_transform_one + + @property + def _transform_one(self): + return _transform_one + def get_feature_names(self): """Get feature names from all transformers. @@ -747,11 +768,18 @@ def fit(self, X, y=None): self.transformer_list = list(self.transformer_list) self._validate_transformers() transformers = Parallel(n_jobs=self.n_jobs)( - delayed(_fit_one_transformer)(trans, X, y) + delayed(self._fit_one_transformer)(trans, X, y) for _, trans, _ in self._iter()) self._update_transformer_list(transformers) return self + def _stack_results(self, Xs): + if any(sparse.issparse(f) for f in Xs): + Xs = sparse.hstack(Xs).tocsr() + else: + Xs = np.hstack(Xs) + return Xs + def fit_transform(self, X, y=None, **fit_params): """Fit all transformers, transform the data and concatenate results. @@ -771,8 +799,8 @@ def fit_transform(self, X, y=None, **fit_params): """ self._validate_transformers() result = Parallel(n_jobs=self.n_jobs)( - delayed(_fit_transform_one)(trans, X, y, weight, - **fit_params) + delayed(self._fit_transform_one)(trans, X, y, weight, + **fit_params) for name, trans, weight in self._iter()) if not result: @@ -780,11 +808,8 @@ def fit_transform(self, X, y=None, **fit_params): return np.zeros((X.shape[0], 0)) Xs, transformers = zip(*result) self._update_transformer_list(transformers) - if any(sparse.issparse(f) for f in Xs): - Xs = sparse.hstack(Xs).tocsr() - else: - Xs = np.hstack(Xs) - return Xs + + return self._stack_results(Xs) def transform(self, X): """Transform X separately by each transformer, concatenate results. @@ -801,16 +826,14 @@ def transform(self, X): sum of n_components (output dimension) over transformers. """ Xs = Parallel(n_jobs=self.n_jobs)( - delayed(_transform_one)(trans, X, None, weight) + delayed(self._transform_one)(trans, X, None, weight) for name, trans, weight in self._iter()) + if not Xs: # All transformers are None return np.zeros((X.shape[0], 0)) - if any(sparse.issparse(f) for f in Xs): - Xs = sparse.hstack(Xs).tocsr() - else: - Xs = np.hstack(Xs) - return Xs + + return self._stack_results(Xs) def _update_transformer_list(self, transformers): transformers = iter(transformers) From b946c332dd1f23bcf19719c9c899a1d7f575ba66 Mon Sep 17 00:00:00 2001 From: caio Date: Tue, 10 Jul 2018 19:36:22 -0300 Subject: [PATCH 2/3] Properties -> staticmethods --- sklearn/pipeline.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8853612c6eeda..6dab5ef922b62 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -139,6 +139,8 @@ class Pipeline(_BaseComposition): False, False]) """ + _fit_transform_one = staticmethod(_fit_transform_one) + # BaseEstimator interface def __init__(self, steps, memory=None): @@ -174,11 +176,6 @@ def set_params(self, **kwargs): self._set_params('steps', **kwargs) return self - @property - def _fit_transform_one(self): - # property needed because `memory.cache` doesn't accept class methods - return _fit_transform_one - def _validate_steps(self): names, estimators = zip(*self.steps) @@ -657,6 +654,10 @@ class FeatureUnion(_BaseComposition, TransformerMixin): array([[ 1.5 , 3.0..., 0.8...], [-1.5 , 5.7..., -0.4...]]) """ + _fit_one_transformer = staticmethod(_fit_one_transformer) + _fit_transform_one = staticmethod(_fit_transform_one) + _transform_one = staticmethod(_transform_one) + def __init__(self, transformer_list, n_jobs=1, transformer_weights=None): self.transformer_list = transformer_list self.n_jobs = n_jobs @@ -719,18 +720,6 @@ def _iter(self): for name, trans in self.transformer_list if trans is not None) - @property - def _fit_one_transformer(self): - return _fit_one_transformer - - @property - def _fit_transform_one(self): - return _fit_transform_one - - @property - def _transform_one(self): - return _transform_one - def get_feature_names(self): """Get feature names from all transformers. @@ -773,7 +762,8 @@ def fit(self, X, y=None): self._update_transformer_list(transformers) return self - def _stack_results(self, Xs): + @staticmethod + def _stack_results(Xs): if any(sparse.issparse(f) for f in Xs): Xs = sparse.hstack(Xs).tocsr() else: From 0fcb79dcda00f6e586c526e78cf96759d7a6f907 Mon Sep 17 00:00:00 2001 From: caio Date: Tue, 10 Jul 2018 21:57:58 -0300 Subject: [PATCH 3/3] rename method --- sklearn/pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6dab5ef922b62..6a0e034613afe 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -763,7 +763,7 @@ def fit(self, X, y=None): return self @staticmethod - def _stack_results(Xs): + def _stack_arrays(Xs): if any(sparse.issparse(f) for f in Xs): Xs = sparse.hstack(Xs).tocsr() else: @@ -799,7 +799,7 @@ def fit_transform(self, X, y=None, **fit_params): Xs, transformers = zip(*result) self._update_transformer_list(transformers) - return self._stack_results(Xs) + return self._stack_arrays(Xs) def transform(self, X): """Transform X separately by each transformer, concatenate results. @@ -823,7 +823,7 @@ def transform(self, X): # All transformers are None return np.zeros((X.shape[0], 0)) - return self._stack_results(Xs) + return self._stack_arrays(Xs) def _update_transformer_list(self, transformers): transformers = iter(transformers)