From a33a66e2796b26a8b09684c75b498b2f537cf637 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Thu, 21 Sep 2023 23:18:17 +0000
Subject: [PATCH] feat: add ml.preprocessing.LabelEncoder

---
 bigframes/ml/compose.py                     |   1 +
 bigframes/ml/pipeline.py                    |  15 +-
 bigframes/ml/preprocessing.py               | 119 ++++++++++++++
 bigframes/ml/sql.py                         |  11 ++
 tests/system/large/ml/test_pipeline.py      |  61 ++++++-
 tests/system/small/ml/test_preprocessing.py | 149 ++++++++++++++++++
 tests/unit/ml/test_compose.py               |   3 +
 tests/unit/ml/test_sql.py                   |  11 +-
 .../sklearn/preprocessing/_label.py         |  52 ++++++
 9 files changed, 415 insertions(+), 7 deletions(-)
 create mode 100644 third_party/bigframes_vendored/sklearn/preprocessing/_label.py

diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py
index 02365f261c..db5d8cf260 100644
--- a/bigframes/ml/compose.py
+++ b/bigframes/ml/compose.py
@@ -29,6 +29,7 @@
 CompilablePreprocessorType = Union[
     preprocessing.OneHotEncoder,
     preprocessing.StandardScaler,
+    preprocessing.LabelEncoder,
 ]
 
 
diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py
index bff0bf36ad..71c21d565a 100644
--- a/bigframes/ml/pipeline.py
+++ b/bigframes/ml/pipeline.py
@@ -50,6 +50,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]):
                 compose.ColumnTransformer,
                 preprocessing.StandardScaler,
                 preprocessing.OneHotEncoder,
+                preprocessing.LabelEncoder,
             ),
         ):
             self._transform = transform
@@ -143,7 +144,11 @@ def _extract_as_column_transformer(
     transformers: List[
         Tuple[
             str,
-            Union[preprocessing.OneHotEncoder, preprocessing.StandardScaler],
+            Union[
+                preprocessing.OneHotEncoder,
+                preprocessing.StandardScaler,
+                preprocessing.LabelEncoder,
+            ],
             Union[str, List[str]],
         ]
     ] = []
@@ -167,6 +172,13 @@ def _extract_as_column_transformer(
                     *preprocessing.OneHotEncoder._parse_from_sql(transform_sql),
                 )
             )
+        elif transform_sql.startswith("ML.LABEL_ENCODER"):
+            transformers.append(
+                (
+                    "label_encoder",
+                    *preprocessing.LabelEncoder._parse_from_sql(transform_sql),
+                )
+            )
         else:
             raise NotImplementedError(
                 f"Unsupported transformer type. {constants.FEEDBACK_LINK}"
@@ -181,6 +193,7 @@ def _merge_column_transformer(
     compose.ColumnTransformer,
     preprocessing.StandardScaler,
     preprocessing.OneHotEncoder,
+    preprocessing.LabelEncoder,
 ]:
     """Try to merge the column transformer to a simple transformer."""
     transformers = column_transformer.transformers_
diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py
index cd4ae27b8c..6ee17751df 100644
--- a/bigframes/ml/preprocessing.py
+++ b/bigframes/ml/preprocessing.py
@@ -24,6 +24,7 @@
 import bigframes.pandas as bpd
 import third_party.bigframes_vendored.sklearn.preprocessing._data
 import third_party.bigframes_vendored.sklearn.preprocessing._encoder
+import third_party.bigframes_vendored.sklearn.preprocessing._label
 
 
 class StandardScaler(
@@ -229,3 +230,121 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
             bpd.DataFrame,
             df[self._output_names],
         )
+
+
+class LabelEncoder(
+    base.Transformer,
+    third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder,
+):
+    # BQML max value https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder
+    TOP_K_DEFAULT = 1000000
+    FREQUENCY_THRESHOLD_DEFAULT = 0
+
+    __doc__ = (
+        third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder.__doc__
+    )
+
+    # All estimators must implement __init__ to document their parameters, even
+    # if they don't have any
+    def __init__(
+        self,
+        min_frequency: Optional[int] = None,
+        max_categories: Optional[int] = None,
+    ):
+        if max_categories is not None and max_categories < 2:
+            raise ValueError(
+                f"max_categories has to be larger than or equal to 2, input is {max_categories}."
+            )
+        self.min_frequency = min_frequency
+        self.max_categories = max_categories
+        self._bqml_model: Optional[core.BqmlModel] = None
+        self._bqml_model_factory = globals.bqml_model_factory()
+        self._base_sql_generator = globals.base_sql_generator()
+
+    # TODO(garrettwu): implement __hash__
+    def __eq__(self, other: Any) -> bool:
+        return (
+            type(other) is LabelEncoder
+            and self._bqml_model == other._bqml_model
+            and self.min_frequency == other.min_frequency
+            and self.max_categories == other.max_categories
+        )
+
+    def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]:
+        """Compile this transformer to a list of SQL expressions that can be included in
+        a BQML TRANSFORM clause
+
+        Args:
+            columns:
+                a list of column names to transform
+
+        Returns: a list of tuples of (sql_expression, output_name)"""
+
+        # minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
+        top_k = (
+            (self.max_categories - 1)
+            if self.max_categories is not None
+            else LabelEncoder.TOP_K_DEFAULT
+        )
+        frequency_threshold = (
+            self.min_frequency
+            if self.min_frequency is not None
+            else LabelEncoder.FREQUENCY_THRESHOLD_DEFAULT
+        )
+        return [
+            (
+                self._base_sql_generator.ml_label_encoder(
+                    column, top_k, frequency_threshold, f"labelencoded_{column}"
+                ),
+                f"labelencoded_{column}",
+            )
+            for column in columns
+        ]
+
+    @classmethod
+    def _parse_from_sql(cls, sql: str) -> tuple[LabelEncoder, str]:
+        """Parse SQL to tuple(LabelEncoder, column_label).
+
+        Args:
+            sql: SQL string of format "ML.LABEL_ENCODER({col_label}, {top_k}, {frequency_threshold}) OVER() "
+
+        Returns:
+            tuple(LabelEncoder, column_label)"""
+        s = sql[sql.find("(") + 1 : sql.find(")")]
+        col_label, top_k, frequency_threshold = s.split(", ")
+        max_categories = int(top_k) + 1
+        min_frequency = int(frequency_threshold)
+
+        return cls(min_frequency, max_categories), col_label
+
+    def fit(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y=None,  # ignored
+    ) -> LabelEncoder:
+        (X,) = utils.convert_to_dataframe(X)
+
+        compiled_transforms = self._compile_to_sql(X.columns.tolist())
+        transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms]
+
+        self._bqml_model = self._bqml_model_factory.create_model(
+            X,
+            options={"model_type": "transform_only"},
+            transforms=transform_sqls,
+        )
+
+        # The schema of TRANSFORM output is not available in the model API, so save it during fitting
+        self._output_names = [name for _, name in compiled_transforms]
+        return self
+
+    def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("Must be fitted before transform")
+
+        (X,) = utils.convert_to_dataframe(X)
+
+        df = self._bqml_model.transform(X)
+        return typing.cast(
+            bpd.DataFrame,
+            df[self._output_names],
+        )
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
index 7cf030485b..5d35a10b96 100644
--- a/bigframes/ml/sql.py
+++ b/bigframes/ml/sql.py
@@ -88,6 +88,17 @@ def ml_one_hot_encoder(
         https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder for params."""
         return f"""ML.ONE_HOT_ENCODER({numeric_expr_sql}, '{drop}', {top_k}, {frequency_threshold}) OVER() AS {name}"""
 
+    def ml_label_encoder(
+        self,
+        numeric_expr_sql: str,
+        top_k: int,
+        frequency_threshold: int,
+        name: str,
+    ) -> str:
+        """Encode ML.LABEL_ENCODER for BQML.
+        https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params."""
+        return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}"""
+
 
 class ModelCreationSqlGenerator(BaseSqlGenerator):
     """Sql generator for creating a model entity.
 
     Model id is the standalone id without project id and dataset id."""
diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py
index bec1a51a99..87664b4c3d 100644
--- a/tests/system/large/ml/test_pipeline.py
+++ b/tests/system/large/ml/test_pipeline.py
@@ -570,6 +570,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind
                         preprocessing.StandardScaler(),
                         ["culmen_length_mm", "flipper_length_mm"],
                     ),
+                    (
+                        "label",
+                        preprocessing.LabelEncoder(),
+                        "species",
+                    ),
                 ]
             ),
         ),
@@ -632,6 +637,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id
                         preprocessing.StandardScaler(),
                         ["culmen_length_mm", "flipper_length_mm"],
                     ),
+                    (
+                        "label",
+                        preprocessing.LabelEncoder(),
+                        "species",
+                    ),
                 ]
             ),
         ),
@@ -650,7 +660,7 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id
     assert isinstance(pl_loaded._transform, compose.ColumnTransformer)
 
     transformers = pl_loaded._transform.transformers_
-    assert len(transformers) == 3
+    assert len(transformers) == 4
 
     assert transformers[0][0] == "ont_hot_encoder"
     assert isinstance(transformers[0][1], preprocessing.OneHotEncoder)
@@ -660,13 +670,20 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id
     assert one_hot_encoder.max_categories == 100
     assert transformers[0][2] == "species"
 
-    assert transformers[1][0] == "standard_scaler"
-    assert isinstance(transformers[1][1], preprocessing.StandardScaler)
-    assert transformers[1][2] == "culmen_length_mm"
+    assert transformers[1][0] == "label_encoder"
+    assert isinstance(transformers[1][1], preprocessing.LabelEncoder)
+    label_encoder = transformers[1][1]
+    assert label_encoder.min_frequency == 0
+    assert label_encoder.max_categories == 1000001
+    assert transformers[1][2] == "species"
 
     assert transformers[2][0] == "standard_scaler"
     assert isinstance(transformers[2][1], preprocessing.StandardScaler)
-    assert transformers[2][2] == "flipper_length_mm"
+    assert transformers[2][2] == "culmen_length_mm"
+
+    assert transformers[3][0] == "standard_scaler"
+    assert isinstance(transformers[3][1], preprocessing.StandardScaler)
+    assert transformers[3][2] == "flipper_length_mm"
 
     assert isinstance(pl_loaded._estimator, linear_model.LinearRegression)
     assert pl_loaded._estimator.fit_intercept is False
@@ -735,3 +752,37 @@ def test_pipeline_one_hot_encoder_to_gbq(penguins_df_default_index, dataset_id):
 
     assert isinstance(pl_loaded._estimator, linear_model.LinearRegression)
     assert pl_loaded._estimator.fit_intercept is False
+
+
+def test_pipeline_label_encoder_to_gbq(penguins_df_default_index, dataset_id):
+    pl = pipeline.Pipeline(
+        [
+            (
+                "transform",
+                preprocessing.LabelEncoder(min_frequency=5, max_categories=100),
+            ),
+            ("estimator", linear_model.LinearRegression(fit_intercept=False)),
+        ]
+    )
+
+    df = penguins_df_default_index.dropna()
+    X_train = df[
+        [
+            "sex",
+            "species",
+        ]
+    ]
+    y_train = df[["body_mass_g"]]
+    pl.fit(X_train, y_train)
+
+    pl_loaded = pl.to_gbq(
+        f"{dataset_id}.test_penguins_pipeline_label_encoder", replace=True
+    )
+    assert isinstance(pl_loaded._transform, preprocessing.LabelEncoder)
+
+    label_encoder = pl_loaded._transform
+    assert label_encoder.min_frequency == 5
+    assert label_encoder.max_categories == 100
+
+    assert isinstance(pl_loaded._estimator, linear_model.LinearRegression)
+    assert pl_loaded._estimator.fit_intercept is False
diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py
index 57b9900c48..1f08ef2c2c 100644
--- a/tests/system/small/ml/test_preprocessing.py
+++ b/tests/system/small/ml/test_preprocessing.py
@@ -264,4 +264,153 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_
     pd.testing.assert_frame_equal(result, expected)
 
 
+def test_label_encoder_default_params(new_penguins_df):
+    encoder = bigframes.ml.preprocessing.LabelEncoder()
+    encoder.fit(new_penguins_df[["species", "sex"]])
+
+    result = encoder.transform(new_penguins_df).to_pandas()
+
+    # TODO: bug? feature columns seem to be in nondeterministic random order
+    # workaround: sort columns by name. Can't repro it in pantheon, so could
+    # be a bigframes issue...
+    result = result.reindex(sorted(result.columns), axis=1)
+
+    expected = pd.DataFrame(
+        {
+            "labelencoded_sex": [
+                2,
+                1,
+                1,
+            ],
+            "labelencoded_species": [
+                1,
+                1,
+                2,
+            ],
+        },
+        dtype="Int64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected)
+
+
+def test_label_encoder_default_params_fit_transform(new_penguins_df):
+    encoder = bigframes.ml.preprocessing.LabelEncoder()
+
+    result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas()
+
+    # TODO: bug? feature columns seem to be in nondeterministic random order
+    # workaround: sort columns by name. Can't repro it in pantheon, so could
+    # be a bigframes issue...
+    result = result.reindex(sorted(result.columns), axis=1)
+
+    expected = pd.DataFrame(
+        {
+            "labelencoded_sex": [
+                2,
+                1,
+                1,
+            ],
+            "labelencoded_species": [
+                1,
+                1,
+                2,
+            ],
+        },
+        dtype="Int64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected)
+
+
+def test_label_encoder_series_default_params(new_penguins_df):
+    encoder = bigframes.ml.preprocessing.LabelEncoder()
+    encoder.fit(new_penguins_df["species"])
+
+    result = encoder.transform(new_penguins_df).to_pandas()
+
+    # TODO: bug? feature columns seem to be in nondeterministic random order
+    # workaround: sort columns by name. Can't repro it in pantheon, so could
+    # be a bigframes issue...
+    result = result.reindex(sorted(result.columns), axis=1)
+
+    expected = pd.DataFrame(
+        {
+            "labelencoded_species": [
+                1,
+                1,
+                2,
+            ],
+        },
+        dtype="Int64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected)
+
+
+def test_label_encoder_params(new_penguins_df):
+    encoder = bigframes.ml.preprocessing.LabelEncoder(100, 2)
+    encoder.fit(new_penguins_df[["species", "sex"]])
+
+    result = encoder.transform(new_penguins_df).to_pandas()
+
+    # TODO: bug? feature columns seem to be in nondeterministic random order
+    # workaround: sort columns by name. Can't repro it in pantheon, so could
+    # be a bigframes issue...
+    result = result.reindex(sorted(result.columns), axis=1)
+
+    expected = pd.DataFrame(
+        {
+            "labelencoded_sex": [
+                0,
+                0,
+                0,
+            ],
+            "labelencoded_species": [
+                0,
+                0,
+                0,
+            ],
+        },
+        dtype="Int64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected)
+
+
+def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df):
+    encoder = bigframes.ml.preprocessing.LabelEncoder()
+    encoder.fit(penguins_df_default_index[["species", "sex"]])
+
+    result = encoder.transform(new_penguins_df).to_pandas()
+
+    # TODO: bug? feature columns seem to be in nondeterministic random order
+    # workaround: sort columns by name. Can't repro it in pantheon, so could
+    # be a bigframes issue...
+    result = result.reindex(sorted(result.columns), axis=1)
+
+    expected = pd.DataFrame(
+        {
+            "labelencoded_sex": [
+                3,
+                2,
+                2,
+            ],
+            "labelencoded_species": [
+                1,
+                1,
+                2,
+            ],
+        },
+        dtype="Int64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected)
+
+
 # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.
diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py
index 772a148c95..c5b3b50876 100644
--- a/tests/unit/ml/test_compose.py
+++ b/tests/unit/ml/test_compose.py
@@ -23,10 +23,12 @@
 def test_columntransformer_init_expectedtransforms():
     onehot_transformer = bigframes.ml.preprocessing.OneHotEncoder()
     scaler_transformer = bigframes.ml.preprocessing.StandardScaler()
+    label_transformer = bigframes.ml.preprocessing.LabelEncoder()
     column_transformer = bigframes.ml.compose.ColumnTransformer(
         [
             ("onehot", onehot_transformer, "species"),
             ("scale", scaler_transformer, ["culmen_length_mm", "flipper_length_mm"]),
+            ("label", label_transformer, "species"),
         ]
     )
 
@@ -34,6 +36,7 @@ def test_columntransformer_init_expectedtransforms():
         ("onehot", onehot_transformer, "species"),
         ("scale", scaler_transformer, "culmen_length_mm"),
         ("scale", scaler_transformer, "flipper_length_mm"),
+        ("label", label_transformer, "species"),
     ]
 
 
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index 495e8759e8..23b68aa150 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -51,12 +51,14 @@ def test_transform_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenera
     sql = base_sql_generator.transform(
         "ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a",
         "ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b",
+        "ML.LABEL_ENCODER(col_c) OVER(col_c) AS encoded_col_c",
     )
     assert (
         sql
         == """TRANSFORM(
   ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a,
-  ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b)"""
+  ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b,
+  ML.LABEL_ENCODER(col_c) OVER(col_c) AS encoded_col_c)"""
     )
 
 
@@ -78,6 +80,13 @@ def test_one_hot_encoder_produces_correct_sql(
     )
 
 
+def test_label_encoder_produces_correct_sql(
+    base_sql_generator: ml_sql.BaseSqlGenerator,
+):
+    sql = base_sql_generator.ml_label_encoder("col_a", 1000000, 0, "encoded_col_a")
+    assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a"
+
+
 def test_create_model_produces_correct_sql(
     model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
 ):
diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py
new file mode 100644
index 0000000000..7e60c846d4
--- /dev/null
+++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py
@@ -0,0 +1,52 @@
+# Authors: Alexandre Gramfort
+#          Mathieu Blondel
+#          Olivier Grisel
+#          Andreas Mueller
+#          Joel Nothman
+#          Hamzeh Alsalhi
+# License: BSD 3 clause
+
+from bigframes import constants
+from third_party.bigframes_vendored.sklearn.base import BaseEstimator
+
+
+class LabelEncoder(BaseEstimator):
+    """Encode target labels with value between 0 and n_classes-1.
+
+    This transformer should be used to encode target values, *i.e.* `y`, and
+    not the input `X`.
+
+    Args:
+        min_frequency (Optional[int], default None):
+            Specifies the minimum frequency below which a category will be considered infrequent.
+            Default None.
+            int: categories with a smaller cardinality will be considered infrequent and encoded as index 0.
+        max_categories (Optional[int], default None):
+            Specifies an upper limit to the number of output features for each input feature
+            when considering infrequent categories. If there are infrequent categories,
+            max_categories includes the category representing the infrequent categories along with the frequent categories.
+            Default None, which sets the limit to 1,000,000.
+    """
+
+    def fit(self, X):
+        """Fit LabelEncoder to X.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                The DataFrame or Series with training data.
+
+        Returns:
+            LabelEncoder: Fitted encoder.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def transform(self, X):
+        """Transform X using label encoding.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                The DataFrame or Series to be transformed.
+
+        Returns:
+            bigframes.dataframe.DataFrame: The result is an array-like of values."""
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
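
Example usage (a minimal sketch, not part of the patch): the snippet below exercises the API added above, both standalone and inside a pipeline. It assumes a configured BigQuery DataFrames session; the `bigquery-public-data.ml_datasets.penguins` table is an illustrative stand-in for the penguins fixtures used in the tests, and the parameter values mirror test_pipeline_label_encoder_to_gbq.

    # Sketch only: table reference and column names are illustrative assumptions.
    import bigframes.pandas as bpd
    from bigframes.ml import compose, linear_model, pipeline, preprocessing

    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins").dropna()

    # Standalone: fit on string columns, then transform to integer label columns
    # named labelencoded_<column> (see LabelEncoder._compile_to_sql).
    encoder = preprocessing.LabelEncoder(min_frequency=5, max_categories=100)
    encoder.fit(df[["species", "sex"]])
    encoded = encoder.transform(df)  # labelencoded_species, labelencoded_sex

    # Inside a pipeline, mixed with another preprocessor via ColumnTransformer.
    pl = pipeline.Pipeline(
        [
            (
                "preproc",
                compose.ColumnTransformer(
                    [
                        ("label", preprocessing.LabelEncoder(), "species"),
                        ("scale", preprocessing.StandardScaler(), ["culmen_length_mm"]),
                    ]
                ),
            ),
            ("estimator", linear_model.LinearRegression(fit_intercept=False)),
        ]
    )
    pl.fit(df[["species", "culmen_length_mm"]], df[["body_mass_g"]])

With default parameters, top_k falls back to TOP_K_DEFAULT (1,000,000) and frequency_threshold to 0, which is why the round-trip test above expects min_frequency == 0 and max_categories == 1000001 after loading the pipeline back from BigQuery.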