diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index f07274f8fc..f899ac7119 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -195,3 +195,23 @@ def fit_transform( y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, ) -> bpd.DataFrame: return self.fit(X, y).transform(X) + + +class LabelTransformer(BaseEstimator): + """A BigQuery DataFrames Label Transformer base class that transforms data. + + Also the transformers can be attached to a pipeline with a predictor.""" + + @abc.abstractmethod + def fit(self, y): + pass + + @abc.abstractmethod + def transform(self, y): + pass + + def fit_transform( + self, + y: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: + return self.fit(y).transform(y) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index f4f5446651..ed0b36deef 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -315,7 +315,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: class LabelEncoder( - base.Transformer, + base.LabelTransformer, third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder, ): # BQML max value https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder#syntax @@ -401,16 +401,15 @@ def _parse_from_sql(cls, sql: str) -> tuple[LabelEncoder, str]: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y=None, # ignored + y: Union[bpd.DataFrame, bpd.Series], ) -> LabelEncoder: - (X,) = utils.convert_to_dataframe(X) + (y,) = utils.convert_to_dataframe(y) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) + compiled_transforms = self._compile_to_sql(y.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] self._bqml_model = self._bqml_model_factory.create_model( - X, + y, options={"model_type": "transform_only"}, transforms=transform_sqls, ) @@ -419,13 +418,13 @@ def fit( self._output_names = [name for _, name in compiled_transforms] return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (y,) = utils.convert_to_dataframe(y) - df = self._bqml_model.transform(X) + df = self._bqml_model.transform(y) return typing.cast( bpd.DataFrame, df[self._output_names], diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 7779eb8f6e..61bddb144d 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -357,9 +357,9 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ def test_label_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder() - encoder.fit(new_penguins_df[["species", "sex"]]) + encoder.fit(new_penguins_df["species"]) - result = encoder.transform(new_penguins_df).to_pandas() + result = encoder.transform(new_penguins_df["species"]).to_pandas() # TODO: bug? feature columns seem to be in nondeterministic random order # workaround: sort columns by name. Can't repro it in pantheon, so could @@ -368,11 +368,6 @@ def test_label_encoder_default_params(new_penguins_df): expected = pd.DataFrame( { - "labelencoded_sex": [ - 2, - 1, - 1, - ], "labelencoded_species": [ 1, 1, @@ -389,7 +384,7 @@ def test_label_encoder_default_params(new_penguins_df): def test_label_encoder_default_params_fit_transform(new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder() - result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas() + result = encoder.fit_transform(new_penguins_df[["species"]]).to_pandas() # TODO: bug? feature columns seem to be in nondeterministic random order # workaround: sort columns by name. Can't repro it in pantheon, so could @@ -398,11 +393,6 @@ def test_label_encoder_default_params_fit_transform(new_penguins_df): expected = pd.DataFrame( { - "labelencoded_sex": [ - 2, - 1, - 1, - ], "labelencoded_species": [ 1, 1, @@ -444,7 +434,7 @@ def test_label_encoder_series_default_params(new_penguins_df): def test_label_encoder_params(new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder(100, 2) - encoder.fit(new_penguins_df[["species", "sex"]]) + encoder.fit(new_penguins_df[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -455,11 +445,6 @@ def test_label_encoder_params(new_penguins_df): expected = pd.DataFrame( { - "labelencoded_sex": [ - 0, - 0, - 0, - ], "labelencoded_species": [ 0, 0, @@ -475,7 +460,7 @@ def test_label_encoder_params(new_penguins_df): def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder() - encoder.fit(penguins_df_default_index[["species", "sex"]]) + encoder.fit(penguins_df_default_index[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -486,11 +471,6 @@ def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df expected = pd.DataFrame( { - "labelencoded_sex": [ - 3, - 2, - 2, - ], "labelencoded_species": [ 1, 1, diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index 7e60c846d4..83f8eb0f9c 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -28,11 +28,11 @@ class LabelEncoder(BaseEstimator): Default None, set limit to 1,000,000. """ - def fit(self, X): - """Fit LabelEncoder to X. + def fit(self, y): + """Fit label encoder. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series): The DataFrame or Series with training data. Returns: @@ -40,11 +40,11 @@ def fit(self, X): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def transform(self, X): - """Transform X using label encoding. + def transform(self, y): + """Transform y using label encoding. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series): The DataFrame or Series to be transformed. Returns: