
[MRG] Refactor pipeline namespace to make it more reusable #11446


Status: Closed. Wants to merge 3 commits.

Changes from all commits, file: sklearn/pipeline.py
109 changes: 61 additions & 48 deletions sklearn/pipeline.py
@@ -26,6 +26,33 @@
__all__ = ['Pipeline', 'FeatureUnion', 'make_pipeline', 'make_union']


# weight and fit_params are not used but it allows _fit_one_transformer,
# _transform_one and _fit_transform_one to have the same signature to
# factorize the code in ColumnTransformer
def _fit_one_transformer(transformer, X, y, weight=None, **fit_params):
    return transformer.fit(X, y)


def _apply_weight(Xt, weight=None):
    # if we have a weight for this transformer, multiply output
    if weight is None:
        return Xt
    return Xt * weight
Member: I'm not sure there is really a need to factorize the code into this additional function. Here we only have 2 repetitions with little code, and a rule of thumb (source forgotten) is to factorize after 3.

Contributor Author: Looking at the code base right now I'd agree with you, but it won't hurt to keep it as a static method (and I use this as well 😬).

Member: Yes, but if it's only used in _transform_one and _fit_transform_one you could always override those two functions and define _apply_weight as needed, can't you? (Or copy this function into your package.)

Contributor Author (@caioaao, Jul 11, 2018): I could, but what would be the benefit of doing so? Another benefit of wrapping snippets in functions, besides reusability, is readability: this way you don't need a comment describing the purpose of a snippet.
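
To make the reuse concrete, here is an editor's sketch (not part of the PR) of the kind of override being discussed. It assumes this PR is applied and defines a hypothetical VerboseFeatureUnion:

from sklearn.pipeline import FeatureUnion, _transform_one


def _verbose_transform_one(transformer, X, y, weight, **fit_params):
    # Hypothetical hook with the same signature as _transform_one; it
    # reports which transformer runs, then delegates to the stock helper.
    print("transforming with %s (weight=%r)"
          % (type(transformer).__name__, weight))
    return _transform_one(transformer, X, y, weight, **fit_params)


class VerboseFeatureUnion(FeatureUnion):
    # With the hooks exposed as staticmethod class attributes, swapping
    # one in is a single rebinding; transform() picks it up via self.
    _transform_one = staticmethod(_verbose_transform_one)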



def _transform_one(transformer, X, y, weight, **fit_params):
    res = transformer.transform(X)
    return _apply_weight(res, weight)


def _fit_transform_one(transformer, X, y, weight, **fit_params):
    if hasattr(transformer, 'fit_transform'):
        res = transformer.fit_transform(X, y, **fit_params)
    else:
        res = transformer.fit(X, y, **fit_params).transform(X)
    return _apply_weight(res, weight), transformer

Member: I suppose it doesn't work if you leave _fit_transform_one in the same place as where it was before?
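
Editor's note on the question above: it cannot stay where it was, because the Pipeline class body below rebinds the name (_fit_transform_one = staticmethod(_fit_transform_one)) and class bodies execute at import time. A minimal sketch of the failure with the original file order:

# Module code runs top to bottom, so this raises NameError at import
# time: the class body looks up the helper before it is defined.
class Pipeline(object):
    _fit_transform_one = staticmethod(_fit_transform_one)  # NameError


def _fit_transform_one(transformer, X, y, weight, **fit_params):
    pass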


class Pipeline(_BaseComposition):
    """Pipeline of transforms with a final estimator.

@@ -112,6 +139,8 @@ class Pipeline(_BaseComposition):
    False, False])
    """

    _fit_transform_one = staticmethod(_fit_transform_one)

    # BaseEstimator interface

    def __init__(self, steps, memory=None):
@@ -194,7 +223,7 @@ def _fit(self, X, y=None, **fit_params):
        # Setup the memory
        memory = check_memory(self.memory)

-       fit_transform_one_cached = memory.cache(_fit_transform_one)
+       fit_transform_one_cached = memory.cache(self._fit_transform_one)

        fit_params_steps = dict((name, {}) for name, step in self.steps
                                if step is not None)
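
A side note (editor's sketch): a staticmethod class attribute dereferences to the plain underlying function, so self._fit_transform_one hands joblib's Memory an ordinary hashable callable, exactly like the old module-level reference, while a subclass that rebinds the attribute gets its own hook cached instead. Assuming this PR is applied:

from sklearn.externals.joblib import Memory
from sklearn.pipeline import Pipeline, _fit_transform_one

# Accessing the staticmethod attribute yields the plain function:
assert Pipeline._fit_transform_one is _fit_transform_one

# so caching it works just as it did with the module-level name.
memory = Memory('/tmp/sklearn_cache', verbose=0)
cached_fit_transform_one = memory.cache(Pipeline._fit_transform_one)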
@@ -582,32 +611,6 @@ def make_pipeline(*steps, **kwargs):
    return Pipeline(_name_estimators(steps), memory=memory)


-# weight and fit_params are not used but it allows _fit_one_transformer,
-# _transform_one and _fit_transform_one to have the same signature to
-# factorize the code in ColumnTransformer
-def _fit_one_transformer(transformer, X, y, weight=None, **fit_params):
-    return transformer.fit(X, y)
-
-
-def _transform_one(transformer, X, y, weight, **fit_params):
-    res = transformer.transform(X)
-    # if we have a weight for this transformer, multiply output
-    if weight is None:
-        return res
-    return res * weight
-
-
-def _fit_transform_one(transformer, X, y, weight, **fit_params):
-    if hasattr(transformer, 'fit_transform'):
-        res = transformer.fit_transform(X, y, **fit_params)
-    else:
-        res = transformer.fit(X, y, **fit_params).transform(X)
-    # if we have a weight for this transformer, multiply output
-    if weight is None:
-        return res, transformer
-    return res * weight, transformer


class FeatureUnion(_BaseComposition, TransformerMixin):
    """Concatenates results of multiple transformer objects.

@@ -651,6 +654,10 @@ class FeatureUnion(_BaseComposition, TransformerMixin):
    array([[ 1.5 , 3.0..., 0.8...],
           [-1.5 , 5.7..., -0.4...]])
    """
    _fit_one_transformer = staticmethod(_fit_one_transformer)
    _fit_transform_one = staticmethod(_fit_transform_one)
    _transform_one = staticmethod(_transform_one)

    def __init__(self, transformer_list, n_jobs=1, transformer_weights=None):
        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
@@ -685,6 +692,15 @@ def set_params(self, **kwargs):
        self._set_params('transformer_list', **kwargs)
        return self

    def _validate_one_transformer(self, t):
        if t is None:
            return
        if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
                hasattr(t, "transform")):
            raise TypeError("All estimators should implement fit and "
                            "transform. '%s' (type %s) doesn't" %
                            (t, type(t)))
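
A quick usage sketch of the validation contract (editor's illustration with toy classes): anything exposing fit or fit_transform plus transform passes, None is allowed as a placeholder, and anything else raises the TypeError above.

from sklearn.pipeline import FeatureUnion


class GoodTransformer(object):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X


union = FeatureUnion([('good', GoodTransformer())])
union._validate_one_transformer(GoodTransformer())  # passes silently
union._validate_one_transformer(None)               # passes: None means "skip"
union._validate_one_transformer(object())           # raises TypeError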

    def _validate_transformers(self):
        names, transformers = zip(*self.transformer_list)

@@ -693,13 +709,7 @@ def _validate_transformers(self):

        # validate estimators
        for t in transformers:
-           if t is None:
-               continue
-           if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
-                   hasattr(t, "transform")):
-               raise TypeError("All estimators should implement fit and "
-                               "transform. '%s' (type %s) doesn't" %
-                               (t, type(t)))
+           self._validate_one_transformer(t)

    def _iter(self):
        """
@@ -747,11 +757,19 @@ def fit(self, X, y=None):
        self.transformer_list = list(self.transformer_list)
        self._validate_transformers()
        transformers = Parallel(n_jobs=self.n_jobs)(
-           delayed(_fit_one_transformer)(trans, X, y)
+           delayed(self._fit_one_transformer)(trans, X, y)
            for _, trans, _ in self._iter())
        self._update_transformer_list(transformers)
        return self

    @staticmethod
    def _stack_arrays(Xs):
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = np.hstack(Xs)
        return Xs
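
A standalone illustration of the extracted helper (editor's sketch mirroring _stack_arrays, not the PR's code verbatim): dense blocks are concatenated with numpy.hstack, while a single sparse block promotes the whole result to a CSR matrix.

import numpy as np
from scipy import sparse


def stack_arrays(Xs):
    # mirrors FeatureUnion._stack_arrays from this diff
    if any(sparse.issparse(f) for f in Xs):
        return sparse.hstack(Xs).tocsr()
    return np.hstack(Xs)


print(stack_arrays([np.ones((2, 2)), np.zeros((2, 1))]).shape)  # (2, 3), dense
print(stack_arrays([np.ones((2, 2)), sparse.eye(2)]).format)    # 'csr', sparse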

    def fit_transform(self, X, y=None, **fit_params):
        """Fit all transformers, transform the data and concatenate results.

@@ -771,20 +789,17 @@ def fit_transform(self, X, y=None, **fit_params):
"""
self._validate_transformers()
result = Parallel(n_jobs=self.n_jobs)(
delayed(_fit_transform_one)(trans, X, y, weight,
**fit_params)
delayed(self._fit_transform_one)(trans, X, y, weight,
**fit_params)
for name, trans, weight in self._iter())

if not result:
# All transformers are None
return np.zeros((X.shape[0], 0))
Xs, transformers = zip(*result)
self._update_transformer_list(transformers)
if any(sparse.issparse(f) for f in Xs):
Xs = sparse.hstack(Xs).tocsr()
else:
Xs = np.hstack(Xs)
return Xs

return self._stack_arrays(Xs)

    def transform(self, X):
        """Transform X separately by each transformer, concatenate results.
@@ -801,16 +816,14 @@ def transform(self, X):
        sum of n_components (output dimension) over transformers.
        """
        Xs = Parallel(n_jobs=self.n_jobs)(
-           delayed(_transform_one)(trans, X, None, weight)
+           delayed(self._transform_one)(trans, X, None, weight)
            for name, trans, weight in self._iter())

        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
-       if any(sparse.issparse(f) for f in Xs):
-           Xs = sparse.hstack(Xs).tocsr()
-       else:
-           Xs = np.hstack(Xs)
-       return Xs
+       return self._stack_arrays(Xs)

    def _update_transformer_list(self, transformers):
        transformers = iter(transformers)