-
-
Notifications
You must be signed in to change notification settings - Fork 26k
[MRG] Refactor pipeline namespace to make it more reusable #11446
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,33 @@ | |
__all__ = ['Pipeline', 'FeatureUnion', 'make_pipeline', 'make_union'] | ||
|
||
|
||
# weight and fit_params are not used but it allows _fit_one_transformer, | ||
# _transform_one and _fit_transform_one to have the same signature to | ||
# factorize the code in ColumnTransformer | ||
def _fit_one_transformer(transformer, X, y, weight=None, **fit_params): | ||
return transformer.fit(X, y) | ||
|
||
|
||
def _apply_weight(Xt, weight=None): | ||
# if we have a weight for this transformer, multiply output | ||
if weight is None: | ||
return Xt | ||
return Xt * weight | ||
|
||
|
||
def _transform_one(transformer, X, y, weight, **fit_params): | ||
res = transformer.transform(X) | ||
return _apply_weight(res, weight) | ||
|
||
|
||
def _fit_transform_one(transformer, X, y, weight, **fit_params): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It suppose it doesn't work if you leave |
||
if hasattr(transformer, 'fit_transform'): | ||
res = transformer.fit_transform(X, y, **fit_params) | ||
else: | ||
res = transformer.fit(X, y, **fit_params).transform(X) | ||
return _apply_weight(res, weight), transformer | ||
|
||
|
||
class Pipeline(_BaseComposition): | ||
"""Pipeline of transforms with a final estimator. | ||
|
||
|
@@ -112,6 +139,8 @@ class Pipeline(_BaseComposition): | |
False, False]) | ||
""" | ||
|
||
_fit_transform_one = staticmethod(_fit_transform_one) | ||
|
||
# BaseEstimator interface | ||
|
||
def __init__(self, steps, memory=None): | ||
|
@@ -194,7 +223,7 @@ def _fit(self, X, y=None, **fit_params): | |
# Setup the memory | ||
memory = check_memory(self.memory) | ||
|
||
fit_transform_one_cached = memory.cache(_fit_transform_one) | ||
fit_transform_one_cached = memory.cache(self._fit_transform_one) | ||
|
||
fit_params_steps = dict((name, {}) for name, step in self.steps | ||
if step is not None) | ||
|
@@ -582,32 +611,6 @@ def make_pipeline(*steps, **kwargs): | |
return Pipeline(_name_estimators(steps), memory=memory) | ||
|
||
|
||
# weight and fit_params are not used but it allows _fit_one_transformer, | ||
# _transform_one and _fit_transform_one to have the same signature to | ||
# factorize the code in ColumnTransformer | ||
def _fit_one_transformer(transformer, X, y, weight=None, **fit_params): | ||
return transformer.fit(X, y) | ||
|
||
|
||
def _transform_one(transformer, X, y, weight, **fit_params): | ||
res = transformer.transform(X) | ||
# if we have a weight for this transformer, multiply output | ||
if weight is None: | ||
return res | ||
return res * weight | ||
|
||
|
||
def _fit_transform_one(transformer, X, y, weight, **fit_params): | ||
if hasattr(transformer, 'fit_transform'): | ||
res = transformer.fit_transform(X, y, **fit_params) | ||
else: | ||
res = transformer.fit(X, y, **fit_params).transform(X) | ||
# if we have a weight for this transformer, multiply output | ||
if weight is None: | ||
return res, transformer | ||
return res * weight, transformer | ||
|
||
|
||
class FeatureUnion(_BaseComposition, TransformerMixin): | ||
"""Concatenates results of multiple transformer objects. | ||
|
||
|
@@ -651,6 +654,10 @@ class FeatureUnion(_BaseComposition, TransformerMixin): | |
array([[ 1.5 , 3.0..., 0.8...], | ||
[-1.5 , 5.7..., -0.4...]]) | ||
""" | ||
_fit_one_transformer = staticmethod(_fit_one_transformer) | ||
_fit_transform_one = staticmethod(_fit_transform_one) | ||
_transform_one = staticmethod(_transform_one) | ||
|
||
def __init__(self, transformer_list, n_jobs=1, transformer_weights=None): | ||
self.transformer_list = transformer_list | ||
self.n_jobs = n_jobs | ||
|
@@ -685,6 +692,15 @@ def set_params(self, **kwargs): | |
self._set_params('transformer_list', **kwargs) | ||
return self | ||
|
||
def _validate_one_transformer(self, t): | ||
if t is None: | ||
return | ||
if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not | ||
hasattr(t, "transform")): | ||
raise TypeError("All estimators should implement fit and " | ||
"transform. '%s' (type %s) doesn't" % | ||
(t, type(t))) | ||
|
||
def _validate_transformers(self): | ||
names, transformers = zip(*self.transformer_list) | ||
|
||
|
@@ -693,13 +709,7 @@ def _validate_transformers(self): | |
|
||
# validate estimators | ||
for t in transformers: | ||
if t is None: | ||
continue | ||
if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not | ||
hasattr(t, "transform")): | ||
raise TypeError("All estimators should implement fit and " | ||
"transform. '%s' (type %s) doesn't" % | ||
(t, type(t))) | ||
self._validate_one_transformer(t) | ||
|
||
def _iter(self): | ||
""" | ||
|
@@ -747,11 +757,19 @@ def fit(self, X, y=None): | |
self.transformer_list = list(self.transformer_list) | ||
self._validate_transformers() | ||
transformers = Parallel(n_jobs=self.n_jobs)( | ||
delayed(_fit_one_transformer)(trans, X, y) | ||
delayed(self._fit_one_transformer)(trans, X, y) | ||
for _, trans, _ in self._iter()) | ||
self._update_transformer_list(transformers) | ||
return self | ||
|
||
@staticmethod | ||
def _stack_arrays(Xs): | ||
if any(sparse.issparse(f) for f in Xs): | ||
Xs = sparse.hstack(Xs).tocsr() | ||
else: | ||
Xs = np.hstack(Xs) | ||
return Xs | ||
|
||
def fit_transform(self, X, y=None, **fit_params): | ||
"""Fit all transformers, transform the data and concatenate results. | ||
|
||
|
@@ -771,20 +789,17 @@ def fit_transform(self, X, y=None, **fit_params): | |
""" | ||
self._validate_transformers() | ||
result = Parallel(n_jobs=self.n_jobs)( | ||
delayed(_fit_transform_one)(trans, X, y, weight, | ||
**fit_params) | ||
delayed(self._fit_transform_one)(trans, X, y, weight, | ||
**fit_params) | ||
for name, trans, weight in self._iter()) | ||
|
||
if not result: | ||
# All transformers are None | ||
return np.zeros((X.shape[0], 0)) | ||
Xs, transformers = zip(*result) | ||
self._update_transformer_list(transformers) | ||
if any(sparse.issparse(f) for f in Xs): | ||
Xs = sparse.hstack(Xs).tocsr() | ||
else: | ||
Xs = np.hstack(Xs) | ||
return Xs | ||
|
||
return self._stack_arrays(Xs) | ||
|
||
def transform(self, X): | ||
"""Transform X separately by each transformer, concatenate results. | ||
|
@@ -801,16 +816,14 @@ def transform(self, X): | |
sum of n_components (output dimension) over transformers. | ||
""" | ||
Xs = Parallel(n_jobs=self.n_jobs)( | ||
delayed(_transform_one)(trans, X, None, weight) | ||
delayed(self._transform_one)(trans, X, None, weight) | ||
for name, trans, weight in self._iter()) | ||
|
||
if not Xs: | ||
# All transformers are None | ||
return np.zeros((X.shape[0], 0)) | ||
if any(sparse.issparse(f) for f in Xs): | ||
Xs = sparse.hstack(Xs).tocsr() | ||
else: | ||
Xs = np.hstack(Xs) | ||
return Xs | ||
|
||
return self._stack_arrays(Xs) | ||
|
||
def _update_transformer_list(self, transformers): | ||
transformers = iter(transformers) | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure there is really a need to factorize the code into this additional function. Here we only have 2 repetitions with little code, and a rule of thumb (source forgotten) is to factorize only after 3 repetitions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
looking at the code base right now I'd agree with you, but it won't hurt to keep it as a static method (and I use this as well 😬 )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but if it's only used in
_transform_one
and _fit_transform_one
you could always overwrite those two functions and define _apply_weight
as needed, can't you? (or copy this function into your package). Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I could, but what would be the benefit of doing so? Another benefit from wrapping snippets in functions, besides reusability, is readability. This way you don't need a comment describing the purpose of a snippet