From a8576f9f24b7a7a7c9427858682dc69bdd7bd7e2 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 31 Oct 2023 17:15:38 +0000 Subject: [PATCH 1/8] test: add code snippets for using bigframes.ml --- .../snippets/clean_and_prep_ml_data_test.py | 42 +++++++++++++++ samples/snippets/clustering_model_test.py | 34 ++++++++++++ samples/snippets/gen_ai_model_test.py | 39 ++++++++++++++ samples/snippets/regression_model_test.py | 53 +++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 samples/snippets/clean_and_prep_ml_data_test.py create mode 100644 samples/snippets/clustering_model_test.py create mode 100644 samples/snippets/gen_ai_model_test.py create mode 100644 samples/snippets/regression_model_test.py diff --git a/samples/snippets/clean_and_prep_ml_data_test.py b/samples/snippets/clean_and_prep_ml_data_test.py new file mode 100644 index 0000000000..8aefb437c8 --- /dev/null +++ b/samples/snippets/clean_and_prep_ml_data_test.py @@ -0,0 +1,42 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_clean_and_prep_ml_data(): + import bigframes.pandas as bpd + + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # [START bigquery_dataframes_clean_and_prep_data] + # Filter down to the data to the Adelie Penguin species + adelie_data = bq_df[bq_df.species == "Adelie Penguin (Pygoscelis adeliae)"] + + # Drop the species column + adelie_data = adelie_data.drop(columns=["species"]) + + # Drop rows with nulls to get training data + training_data = adelie_data.dropna() + + # Specify your feature (or input) columns and the label (or output) column: + feature_columns = training_data[ + ["island", "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "sex"] + ] + label_columns = training_data[["body_mass_g"]] + + test_data = adelie_data[adelie_data.body_mass_g.isnull()] + # [END bigquery_dataframes_clean_and_prep_data] + assert test_data is not None + assert feature_columns is not None + assert label_columns is not None diff --git a/samples/snippets/clustering_model_test.py b/samples/snippets/clustering_model_test.py new file mode 100644 index 0000000000..6d127c9005 --- /dev/null +++ b/samples/snippets/clustering_model_test.py @@ -0,0 +1,34 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_clustering_model(): + from bigframes.ml.cluster import KMeans + import bigframes.pandas as bpd + + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + # [START bigquery_dataframes_clustering_model] + # Create the KMeans model + cluster_model = KMeans(n_clusters=10) + cluster_model.fit(bq_df["culmen_length_mm"], bq_df["sex"]) + + # Predict using the model + result = cluster_model.predict(bq_df) + # Score the model + score = cluster_model.score(bq_df) + # [END bigquery_dataframes_clustering_model] + assert result is not None + assert score is not None diff --git a/samples/snippets/gen_ai_model_test.py b/samples/snippets/gen_ai_model_test.py new file mode 100644 index 0000000000..7cbc90d4c0 --- /dev/null +++ b/samples/snippets/gen_ai_model_test.py @@ -0,0 +1,39 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_llm_model(): + PROJECT_ID = "bigframes-dev" + REGION = "us" + CONN_NAME = "bigframes-ml" + # [START bigquery_dataframes_gen_ai_model] + from bigframes.ml.llm import PaLM2TextGenerator + import bigframes.pandas as bpd + + # Create the LLM model + session = bpd.get_global_session() + connection = f"{PROJECT_ID}.{REGION}.{CONN_NAME}" + model = PaLM2TextGenerator(session=session, connection_name=connection) + + df_api = bpd.read_csv("gs://cloud-samples-data/vertex-ai/bigframe/df.csv") + + # Prepare the prompts and send them to the LLM model for prediction + df_prompt_prefix = "Generate Pandas sample code for DataFrame." + df_prompt = df_prompt_prefix + df_api["API"] + + # Predict using the model + df_pred = model.predict(df_prompt.to_frame(), max_output_tokens=1024) + # [END bigquery_dataframes_gen_ai_model] + assert df_pred["ml_generate_text_llm_result"] is not None + assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None diff --git a/samples/snippets/regression_model_test.py b/samples/snippets/regression_model_test.py new file mode 100644 index 0000000000..fe65680b44 --- /dev/null +++ b/samples/snippets/regression_model_test.py @@ -0,0 +1,53 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_regression_model(): + import bigframes.pandas as bpd + + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Filter down to the data to the Adelie Penguin species + adelie_data = bq_df[bq_df.species == "Adelie Penguin (Pygoscelis adeliae)"] + + # Drop the species column + adelie_data = adelie_data.drop(columns=["species"]) + + # Drop rows with nulls to get training data + training_data = adelie_data.dropna() + + # Specify your feature (or input) columns and the label (or output) column: + feature_columns = training_data[ + ["island", "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "sex"] + ] + label_columns = training_data[["body_mass_g"]] + + test_data = adelie_data[adelie_data.body_mass_g.isnull()] + # [START bigquery_dataframes_regression_model] + from bigframes.ml.linear_model import LinearRegression + + model = LinearRegression() + + model.fit(feature_columns, label_columns) + + # Score the model + score = model.score(feature_columns, label_columns) + + # Predict using the model + result = model.predict(test_data) + # [END bigquery_dataframes_regression_model] + assert model is not None + assert score is not None + assert result is not None From aa442ce3c82ab2e577e93f3fc20873592143799f Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 31 Oct 2023 10:54:17 -0700 Subject: [PATCH 2/8] test: add code snippets for loading data from BigQuery Job (#154) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: add code snippets for loading data from BigQuery Job * fix: address the comments * fix: fix the broken test * use BigQuery Client library to get the job_id * feat: Implement operator `@` for `DataFrame.dot` (#139) Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/297502513 🦕 * fix: fix the comments --------- Co-authored-by: Shobhit Singh --- .../load_data_from_biquery_job_test.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 samples/snippets/load_data_from_biquery_job_test.py diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py new file mode 100644 index 0000000000..5271574a49 --- /dev/null +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -0,0 +1,51 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_load_data_from_bigquery_job(): + from google.cloud import bigquery + + # Construct a BigQuery client object. 
+ client = bigquery.Client(project="bigframes-dev", location="us") + + query = """ + SELECT * + FROM `bigquery-public-data.ml_datasets.penguins` + LIMIT 20 + """ + query_job = client.query(query) + JOB_ID = query_job.job_id + your_project_id = "bigframes-dev" + + # [START bigquery_dataframes_load_data_from_bigquery_job] + from google.cloud import bigquery + + import bigframes.pandas as bpd + + # Project ID inserted based on the query results selected to explore + project = your_project_id + # Location inserted based on the query results selected to explore + location = "us" + client = bigquery.Client(project=project, location=location) + + # Job ID inserted based on the query results selected to explore + job_id = JOB_ID + job = client.get_job(job_id) + destination = str(job.destination) + + # Load data from a BigQuery table using BigQuery DataFrames: + bq_df = bpd.read_gbq_table(destination) + + # [END bigquery_dataframes_load_data_from_bigquery_job] + assert bq_df is not None From 14bf68d0338612fd3a99706c4ca1d635937b1188 Mon Sep 17 00:00:00 2001 From: Bradford Orr <15842009+orrbradford@users.noreply.github.com> Date: Tue, 31 Oct 2023 12:46:47 -0700 Subject: [PATCH 3/8] feat: add bigframes.options.compute.maximum_bytes_billed option that sets maximum bytes billed on query jobs (#133) -implement context manager for global options -maximum_bytes_billed only applies to query jobs. This limitation will be set per query.
Operations that trigger multiple jobs may result in total usage beyond this setting --- bigframes/__init__.py | 3 +- bigframes/_config/__init__.py | 11 +++++ bigframes/_config/compute_options.py | 35 +++++++++++++++ bigframes/_config/display_options.py | 23 ++++------ bigframes/pandas/__init__.py | 4 ++ bigframes/session/__init__.py | 19 +++++--- docs/reference/bigframes/options.rst | 2 + docs/templates/toc.yml | 2 + tests/system/conftest.py | 7 --- tests/system/small/test_progress_bar.py | 17 ++++--- tests/unit/test_compute_options.py | 30 +++++++++++++ .../pandas/_config/config.py | 45 +++++++++++++++++++ 12 files changed, 162 insertions(+), 36 deletions(-) create mode 100644 bigframes/_config/compute_options.py create mode 100644 tests/unit/test_compute_options.py create mode 100644 third_party/bigframes_vendored/pandas/_config/config.py diff --git a/bigframes/__init__.py b/bigframes/__init__.py index 8f41790072..bd1476957b 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -14,7 +14,7 @@ """BigQuery DataFrames provides a DataFrame API scaled by the BigQuery engine.""" -from bigframes._config import options +from bigframes._config import option_context, options from bigframes._config.bigquery_options import BigQueryOptions from bigframes.core.global_session import close_session, get_global_session from bigframes.session import connect, Session @@ -28,4 +28,5 @@ "connect", "Session", "__version__", + "option_context", ] diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index e26eaf8800..8dcebfce6a 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -18,8 +18,10 @@ """ import bigframes._config.bigquery_options as bigquery_options +import bigframes._config.compute_options as compute_options import bigframes._config.display_options as display_options import bigframes._config.sampling_options as sampling_options +import third_party.bigframes_vendored.pandas._config.config as pandas_config class 
Options: @@ -29,6 +31,7 @@ def __init__(self): self._bigquery_options = bigquery_options.BigQueryOptions() self._display_options = display_options.DisplayOptions() self._sampling_options = sampling_options.SamplingOptions() + self._compute_options = compute_options.ComputeOptions() @property def bigquery(self) -> bigquery_options.BigQueryOptions: @@ -49,6 +52,11 @@ def sampling(self) -> sampling_options.SamplingOptions: parameters in specific functions.""" return self._sampling_options + @property + def compute(self) -> compute_options.ComputeOptions: + """Options controlling object computation.""" + return self._compute_options + options = Options() """Global options for default session.""" @@ -58,3 +66,6 @@ def sampling(self) -> sampling_options.SamplingOptions: "Options", "options", ) + + +option_context = pandas_config.option_context diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py new file mode 100644 index 0000000000..20c31d3906 --- /dev/null +++ b/bigframes/_config/compute_options.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Options for controlling object computation.""" + +import dataclasses +from typing import Optional + + +@dataclasses.dataclass +class ComputeOptions: + """ + Encapsulates configuration for compute options. + + Attributes: + maximum_bytes_billed (int, Optional): + Limits the bytes billed for query jobs.
Queries that will have + bytes billed beyond this limit will fail (without incurring a + charge). If unspecified, this will be set to your project default. + See `maximum_bytes_billed `_. + + """ + + maximum_bytes_billed: Optional[int] = None diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index 8bd2743f17..ad3ea3f68c 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -40,17 +40,12 @@ def pandas_repr(display_options: DisplayOptions): This context manager makes sure we reset the pandas options when we're done so that we don't override pandas behavior. """ - original_max_cols = pd.options.display.max_columns - original_max_rows = pd.options.display.max_rows - original_show_dimensions = pd.options.display.show_dimensions - - pd.options.display.max_columns = display_options.max_columns - pd.options.display.max_rows = display_options.max_rows - pd.options.display.show_dimensions = True # type: ignore - - try: - yield - finally: - pd.options.display.max_columns = original_max_cols - pd.options.display.max_rows = original_max_rows - pd.options.display.show_dimensions = original_show_dimensions + with pd.option_context( + "display.max_columns", + display_options.max_columns, + "display.max_rows", + display_options.max_rows, + "display.show_dimensions", + True, + ) as pandas_context: + yield (pandas_context) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 8d9726312f..0fab1109dc 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -462,6 +462,9 @@ def read_gbq_function(function_name: str): options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" +option_context = config.option_context +"""Global :class:`~bigframes._config.option_context` to configure BigQuery DataFrames.""" + # Session management APIs get_global_session = global_session.get_global_session close_session = 
global_session.close_session @@ -494,6 +497,7 @@ def read_gbq_function(function_name: str): # Other public pandas attributes "NamedAgg", "options", + "option_context", # Session management APIs "get_global_session", "close_session", diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 932a41f283..4858c7726a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1504,12 +1504,10 @@ def _start_query( max_results: Optional[int] = None, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """ - Starts query job and waits for results + Starts query job and waits for results. """ - if job_config is not None: - query_job = self.bqclient.query(sql, job_config=job_config) - else: - query_job = self.bqclient.query(sql) + job_config = self._prepare_job_config(job_config) + query_job = self.bqclient.query(sql, job_config=job_config) opts = bigframes.options.display if opts.progress_bar is not None and not query_job.configuration.dry_run: @@ -1538,6 +1536,17 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob): else: job.result() + def _prepare_job_config( + self, job_config: Optional[bigquery.QueryJobConfig] = None + ) -> bigquery.QueryJobConfig: + if job_config is None: + job_config = self.bqclient.default_query_job_config + if bigframes.options.compute.maximum_bytes_billed is not None: + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed + ) + return job_config + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/docs/reference/bigframes/options.rst b/docs/reference/bigframes/options.rst index d831a519fe..991399eb88 100644 --- a/docs/reference/bigframes/options.rst +++ b/docs/reference/bigframes/options.rst @@ -12,3 +12,5 @@ Options and settings .. autoclass:: bigframes._config.display_options.DisplayOptions .. autoclass:: bigframes._config.sampling_options.SamplingOptions + +.. 
autoclass:: bigframes._config.compute_options.ComputeOptions diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 4fe2ec1a6a..9879721d28 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -13,6 +13,8 @@ uid: bigframes._config.display_options.DisplayOptions - name: SamplingOptions uid: bigframes._config.sampling_options.SamplingOptions + - name: ComputeOptions + uid: bigframes._config.compute_options.ComputeOptions name: Options and settings - items: - name: Session diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 8885b03d34..f9f69c6c8e 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -898,13 +898,6 @@ def usa_names_grouped_table( return session.bqclient.get_table(table_id) -@pytest.fixture() -def deferred_repr(): - bigframes.options.display.repr_mode = "deferred" - yield - bigframes.options.display.repr_mode = "head" - - @pytest.fixture() def restore_sampling_settings(): enable_downsampling = bigframes.options.sampling.enable_downsampling diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 084b723fba..30ea63b483 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -135,12 +135,11 @@ def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): assert string in query_job_repr -def test_query_job_dry_run( - penguins_df_default_index: bf.dataframe.DataFrame, capsys, deferred_repr -): - repr(penguins_df_default_index) - repr(penguins_df_default_index["body_mass_g"]) - lines = capsys.readouterr().out.split("\n") - lines = filter(None, lines) - for line in lines: - assert "Computation deferred. 
Computation will process" in line +def test_query_job_dry_run(penguins_df_default_index: bf.dataframe.DataFrame, capsys): + with bf.option_context("display.repr_mode", "deferred"): + repr(penguins_df_default_index) + repr(penguins_df_default_index["body_mass_g"]) + lines = capsys.readouterr().out.split("\n") + lines = filter(None, lines) + for line in lines: + assert "Computation deferred. Computation will process" in line diff --git a/tests/unit/test_compute_options.py b/tests/unit/test_compute_options.py new file mode 100644 index 0000000000..499a0a5fef --- /dev/null +++ b/tests/unit/test_compute_options.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import bigframes as bf + +from . 
import resources + + +def test_maximum_bytes_option(): + session = resources.create_bigquery_session() + num_query_calls = 0 + with bf.option_context("compute.maximum_bytes_billed", 10000): + # clear initial method calls + session.bqclient.method_calls = [] + session._start_query("query") + for call in session.bqclient.method_calls: + _, _, kwargs = call + num_query_calls += 1 + assert kwargs["job_config"].maximum_bytes_billed == 10000 + assert num_query_calls > 0 diff --git a/third_party/bigframes_vendored/pandas/_config/config.py b/third_party/bigframes_vendored/pandas/_config/config.py new file mode 100644 index 0000000000..8abaca76c7 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/_config/config.py @@ -0,0 +1,45 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/_config/config.py +import contextlib +import operator + +import bigframes + + +class option_context(contextlib.ContextDecorator): + """ + Context manager to temporarily set options in the `with` statement context. + + You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. + + Examples + -------- + >>> import bigframes + >>> with bigframes.option_context('display.max_rows', 10, 'display.max_columns', 5): + ... pass + """ + + def __init__(self, *args) -> None: + if len(args) % 2 != 0 or len(args) < 2: + raise ValueError( + "Need to invoke as option_context(pat, val, [(pat, val), ...])." 
+ ) + + self.ops = list(zip(args[::2], args[1::2])) + + def __enter__(self) -> None: + self.undo = [ + (pat, operator.attrgetter(pat)(bigframes.options)) for pat, val in self.ops + ] + + for pat, val in self.ops: + self._set_option(pat, val) + + def __exit__(self, *args) -> None: + if self.undo: + for pat, val in self.undo: + self._set_option(pat, val) + + def _set_option(self, pat, val): + root, attr = pat.rsplit(".", 1) + parent = operator.attrgetter(root)(bigframes.options) + setattr(parent, attr, val) From 9a42f039ccb2c207637d46e876bcfb8a5db00046 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 31 Oct 2023 18:00:20 -0500 Subject: [PATCH 4/8] docs: fix indentation on `read_gbq_function` code sample (#163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/session/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 4858c7726a..5a61ed534f 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1467,13 +1467,13 @@ def read_gbq_function( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None - >>> function_name = "bqutil.fn.cw_lower_case_ascii_only" - >>> func = bpd.read_gbq_function(function_name=function_name) - >>> func.bigframes_remote_function - 'bqutil.fn.cw_lower_case_ascii_only' + >>> function_name = "bqutil.fn.cw_lower_case_ascii_only" + >>> func = bpd.read_gbq_function(function_name=function_name) + >>> func.bigframes_remote_function + 'bqutil.fn.cw_lower_case_ascii_only' Args: function_name (str): From fdeb9f293cd56796bb3aa4224a6b99b06dbc61c7 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 31 Oct 2023 18:36:09 +0000 Subject: [PATCH 5/8] move ml samples to e2e, make the tests standalone --- noxfile.py | 2 +- .../snippets/clean_and_prep_ml_data_test.py | 42 ------------------- .../{ => ml_samples}/clustering_model_test.py | 3 +- .../{ => ml_samples}/gen_ai_model_test.py | 0 .../{ => ml_samples}/regression_model_test.py | 10 +++-- 5 files changed, 10 insertions(+), 47 deletions(-) delete mode 100644 samples/snippets/clean_and_prep_ml_data_test.py rename samples/snippets/{ => ml_samples}/clustering_model_test.py (99%) rename samples/snippets/{ => ml_samples}/gen_ai_model_test.py (100%) rename samples/snippets/{ => ml_samples}/regression_model_test.py (91%) diff --git a/noxfile.py 
b/noxfile.py index d0bbda80fd..602b17291d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -377,7 +377,7 @@ def e2e(session: nox.sessions.Session): run_system( session=session, prefix_name="e2e", - test_folder=os.path.join("tests", "system", "large"), + test_folder=[os.path.join("tests", "system", "large")], print_duration=True, ) diff --git a/samples/snippets/clean_and_prep_ml_data_test.py b/samples/snippets/clean_and_prep_ml_data_test.py deleted file mode 100644 index 8aefb437c8..0000000000 --- a/samples/snippets/clean_and_prep_ml_data_test.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -def test_clean_and_prep_ml_data(): - import bigframes.pandas as bpd - - query_or_table = "bigquery-public-data.ml_datasets.penguins" - bq_df = bpd.read_gbq(query_or_table) - - # [START bigquery_dataframes_clean_and_prep_data] - # Filter down to the data to the Adelie Penguin species - adelie_data = bq_df[bq_df.species == "Adelie Penguin (Pygoscelis adeliae)"] - - # Drop the species column - adelie_data = adelie_data.drop(columns=["species"]) - - # Drop rows with nulls to get training data - training_data = adelie_data.dropna() - - # Specify your feature (or input) columns and the label (or output) column: - feature_columns = training_data[ - ["island", "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "sex"] - ] - label_columns = training_data[["body_mass_g"]] - - test_data = adelie_data[adelie_data.body_mass_g.isnull()] - # [END bigquery_dataframes_clean_and_prep_data] - assert test_data is not None - assert feature_columns is not None - assert label_columns is not None diff --git a/samples/snippets/clustering_model_test.py b/samples/snippets/ml_samples/clustering_model_test.py similarity index 99% rename from samples/snippets/clustering_model_test.py rename to samples/snippets/ml_samples/clustering_model_test.py index 6d127c9005..a407fc7805 100644 --- a/samples/snippets/clustering_model_test.py +++ b/samples/snippets/ml_samples/clustering_model_test.py @@ -14,13 +14,14 @@ def test_clustering_model(): + # [START bigquery_dataframes_clustering_model] from bigframes.ml.cluster import KMeans import bigframes.pandas as bpd # Load data from BigQuery query_or_table = "bigquery-public-data.ml_datasets.penguins" bq_df = bpd.read_gbq(query_or_table) - # [START bigquery_dataframes_clustering_model] + # Create the KMeans model cluster_model = KMeans(n_clusters=10) cluster_model.fit(bq_df["culmen_length_mm"], bq_df["sex"]) diff --git a/samples/snippets/gen_ai_model_test.py b/samples/snippets/ml_samples/gen_ai_model_test.py similarity index 100% rename from 
samples/snippets/gen_ai_model_test.py rename to samples/snippets/ml_samples/gen_ai_model_test.py diff --git a/samples/snippets/regression_model_test.py b/samples/snippets/ml_samples/regression_model_test.py similarity index 91% rename from samples/snippets/regression_model_test.py rename to samples/snippets/ml_samples/regression_model_test.py index fe65680b44..7d1bde689c 100644 --- a/samples/snippets/regression_model_test.py +++ b/samples/snippets/ml_samples/regression_model_test.py @@ -14,8 +14,11 @@ def test_regression_model(): + # [START bigquery_dataframes_regression_model] + from bigframes.ml.linear_model import LinearRegression import bigframes.pandas as bpd + # Load data from BigQuery query_or_table = "bigquery-public-data.ml_datasets.penguins" bq_df = bpd.read_gbq(query_or_table) @@ -35,11 +38,9 @@ def test_regression_model(): label_columns = training_data[["body_mass_g"]] test_data = adelie_data[adelie_data.body_mass_g.isnull()] - # [START bigquery_dataframes_regression_model] - from bigframes.ml.linear_model import LinearRegression + # Create the linear model model = LinearRegression() - model.fit(feature_columns, label_columns) # Score the model @@ -48,6 +49,9 @@ def test_regression_model(): # Predict using the model result = model.predict(test_data) # [END bigquery_dataframes_regression_model] + assert test_data is not None + assert feature_columns is not None + assert label_columns is not None assert model is not None assert score is not None assert result is not None From 32e03ccb840fb6d7a7e4d69ac9c30250e9440863 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Wed, 1 Nov 2023 13:41:27 -0700 Subject: [PATCH 6/8] feat: add pd.get_dummies (#149) * feat: add pd.get_dummies * remove unneeded prefix case * param/documentation fixes * be stricter about types in test * be stricter about types in series test * remove unneeded comment * adjust for type difference in pandas 1 * add example code (tested) * fix None columns and add test cases * variable names 
and _get_unique_values per-column * account for pandas 1 behavior difference * remove already_seen set * avoid unnecessary join/projection * fix column ordering edge case * adjust for picky examples checker * example tweak * make part of the example comments * use ellipsis in doctest comment * add to doctest string * extract parameter standardization * extract submethods --------- Co-authored-by: Henry J Solberg --- bigframes/pandas/__init__.py | 177 ++++++++++++++++++ tests/system/small/test_pandas.py | 112 +++++++++++ .../pandas/core/reshape/concat.py | 2 +- .../pandas/core/reshape/encoding.py | 119 ++++++++++++ .../pandas/core/reshape/merge.py | 1 - .../pandas/core/reshape/tile.py | 2 +- 6 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 third_party/bigframes_vendored/pandas/core/reshape/encoding.py diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 0fab1109dc..1c52b103fb 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -45,14 +45,18 @@ ) import bigframes._config as config +import bigframes.constants as constants +import bigframes.core.blocks import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.core.reshape import bigframes.dataframe +import bigframes.operations as ops import bigframes.series import bigframes.session import bigframes.session.clients import third_party.bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat +import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -134,6 +138,179 @@ def cut( cut.__doc__ = vendored_pandas_tile.cut.__doc__ +def get_dummies( + data: Union[DataFrame, Series], + prefix: Union[List, dict, str, None] = None, + prefix_sep: Union[List, dict, str, None] = "_", + 
dummy_na: bool = False, + columns: Optional[List] = None, + drop_first: bool = False, + dtype: Any = None, +) -> DataFrame: + # simplify input parameters into per-input-label lists + # also raise errors for invalid parameters + column_labels, prefixes, prefix_seps = _standardize_get_dummies_params( + data, prefix, prefix_sep, columns, dtype + ) + + # combine prefixes into per-column-id list + full_columns_prefixes, columns_ids = _determine_get_dummies_columns_from_labels( + data, column_labels, prefix is not None, prefixes, prefix_seps + ) + + # run queries to compute unique values + block = data._block + max_unique_value = ( + bigframes.core.blocks._BQ_MAX_COLUMNS + - len(block.value_columns) + - len(block.index_columns) + - 1 + ) // len(column_labels) + columns_values = [ + block._get_unique_values([col_id], max_unique_value) for col_id in columns_ids + ] + + # for each dummified column, add the content of the output columns via block operations + intermediate_col_ids = [] + for i in range(len(columns_values)): + level = columns_values[i].get_level_values(0).sort_values().dropna() + if drop_first: + level = level[1:] + column_label = full_columns_prefixes[i] + column_id = columns_ids[i] + block, new_intermediate_col_ids = _perform_get_dummies_block_operations( + block, level, column_label, column_id, dummy_na + ) + intermediate_col_ids.extend(new_intermediate_col_ids) + + # drop dummified columns (and the intermediate columns we added) + block = block.drop_columns(columns_ids + intermediate_col_ids) + return DataFrame(block) + + +get_dummies.__doc__ = vendored_pandas_encoding.get_dummies.__doc__ + + +def _standardize_get_dummies_params( + data: Union[DataFrame, Series], + prefix: Union[List, dict, str, None], + prefix_sep: Union[List, dict, str, None], + columns: Optional[List], + dtype: Any, +) -> Tuple[List, List[str], List[str]]: + block = data._block + + if isinstance(data, Series): + columns = [block.column_labels[0]] + if columns is not None and not 
pandas.api.types.is_list_like(columns): + raise TypeError("Input must be a list-like for parameter `columns`") + if dtype is not None and dtype not in [ + pandas.BooleanDtype, + bool, + "Boolean", + "boolean", + "bool", + ]: + raise NotImplementedError( + f"Only Boolean dtype is currently supported. {constants.FEEDBACK_LINK}" + ) + + if columns is None: + default_dummy_types = [pandas.StringDtype, "string[pyarrow]"] + columns = [] + columns_set = set() + for col_id in block.value_columns: + label = block.col_id_to_label[col_id] + if ( + label not in columns_set + and block.expr.get_column_type(col_id) in default_dummy_types + ): + columns.append(label) + columns_set.add(label) + + column_labels: List = typing.cast(List, columns) + + def parse_prefix_kwarg(kwarg, kwarg_name) -> Optional[List[str]]: + if kwarg is None: + return None + if isinstance(kwarg, str): + return [kwarg] * len(column_labels) + if isinstance(kwarg, dict): + return [kwarg[column] for column in column_labels] + kwarg = typing.cast(List, kwarg) + if pandas.api.types.is_list_like(kwarg) and len(kwarg) != len(column_labels): + raise ValueError( + f"Length of '{kwarg_name}' ({len(kwarg)}) did not match " + f"the length of the columns being encoded ({len(column_labels)})." 
+ ) + if pandas.api.types.is_list_like(kwarg): + return list(map(str, kwarg)) + raise TypeError(f"{kwarg_name} kwarg must be a string, list, or dictionary") + + prefix_seps = parse_prefix_kwarg(prefix_sep or "_", "prefix_sep") + prefix_seps = typing.cast(List, prefix_seps) + prefixes = parse_prefix_kwarg(prefix, "prefix") + if prefixes is None: + prefixes = column_labels + prefixes = typing.cast(List, prefixes) + + return column_labels, prefixes, prefix_seps + + +def _determine_get_dummies_columns_from_labels( + data: Union[DataFrame, Series], + column_labels: List, + prefix_given: bool, + prefixes: List[str], + prefix_seps: List[str], +) -> Tuple[List[str], List[str]]: + block = data._block + + columns_ids = [] + columns_prefixes = [] + for i in range(len(column_labels)): + label = column_labels[i] + empty_prefix = label is None or (isinstance(data, Series) and not prefix_given) + full_prefix = "" if empty_prefix else prefixes[i] + prefix_seps[i] + + for col_id in block.label_to_col_id[label]: + columns_ids.append(col_id) + columns_prefixes.append(full_prefix) + + return columns_prefixes, columns_ids + + +def _perform_get_dummies_block_operations( + block: bigframes.core.blocks.Block, + level: pandas.Index, + column_label: str, + column_id: str, + dummy_na: bool, +) -> Tuple[bigframes.core.blocks.Block, List[str]]: + intermediate_col_ids = [] + for value in level: + new_column_label = f"{column_label}{value}" + if column_label == "": + new_column_label = value + new_block, new_id = block.apply_unary_op( + column_id, ops.BinopPartialLeft(ops.eq_op, value) + ) + intermediate_col_ids.append(new_id) + block, _ = new_block.apply_unary_op( + new_id, + ops.BinopPartialRight(ops.fillna_op, False), + result_label=new_column_label, + ) + if dummy_na: + # dummy column name for na depends on the dtype + na_string = str(pandas.Index([None], dtype=level.dtype)[0]) + new_column_label = f"{column_label}{na_string}" + block, _ = block.apply_unary_op( + column_id, ops.isnull_op, 
result_label=new_column_label + ) + return block, intermediate_col_ids + + def qcut( x: bigframes.series.Series, q: int, diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index f8fa78587f..0292ebd206 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -45,6 +45,118 @@ def test_concat_series(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("kwargs"), + [ + { + "prefix": ["prefix1", "prefix2"], + "prefix_sep": "_", + "dummy_na": None, + "columns": ["bool_col", "int64_col"], + "drop_first": False, + }, + { + "prefix": "prefix", + "prefix_sep": ["_", ","], + "dummy_na": False, + "columns": ["int64_too", "string_col"], + "drop_first": False, + }, + { + "prefix": None, + "prefix_sep": ".", + "dummy_na": True, + "columns": ["time_col", "float64_col"], + "drop_first": True, + }, + ], +) +def test_get_dummies_dataframe(scalars_dfs, kwargs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = bpd.get_dummies(scalars_df, **kwargs, dtype=bool) + pd_result = pd.get_dummies(scalars_pandas_df, **kwargs, dtype=bool) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_dummies_dataframe_duplicate_labels(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("pandas has different behavior in 1.x") + + scalars_df, scalars_pandas_df = scalars_dfs + + scalars_renamed_df = scalars_df.rename( + columns={"int64_too": "int64_col", "float64_col": None, "string_col": None} + ) + scalars_renamed_pandas_df = scalars_pandas_df.rename( + columns={"int64_too": "int64_col", "float64_col": None, "string_col": None} + ) + + bf_result = bpd.get_dummies( + 
scalars_renamed_df, columns=["int64_col", None], dtype=bool + ) + pd_result = pd.get_dummies( + scalars_renamed_pandas_df, columns=["int64_col", None], dtype=bool + ) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_dummies_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df.date_col + pd_series = scalars_pandas_df.date_col + + bf_result = bpd.get_dummies(bf_series, dtype=bool) + pd_result = pd.get_dummies(pd_series, dtype=bool) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + pd_result.columns = pd_result.columns.astype(object) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_get_dummies_series_nameless(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df.date_col.rename(None) + pd_series = scalars_pandas_df.date_col.rename(None) + + bf_result = bpd.get_dummies(bf_series, dtype=bool) + pd_result = pd.get_dummies(pd_series, dtype=bool) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + pd_result.columns = pd_result.columns.astype(object) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + @pytest.mark.parametrize( ("how"), [ diff --git 
a/third_party/bigframes_vendored/pandas/core/reshape/concat.py b/third_party/bigframes_vendored/pandas/core/reshape/concat.py index 6e6d2d8b5c..b0472c524a 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/concat.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/concat.py @@ -1,6 +1,6 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/concat.py """ -Concat routines. +Concat routines """ from __future__ import annotations diff --git a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py new file mode 100644 index 0000000000..da92b58f50 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py @@ -0,0 +1,119 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/encoding.py +""" +Encoding routines +""" +from __future__ import annotations + +from bigframes import constants + + +def get_dummies( + data, + prefix=None, + prefix_sep="_", + dummy_na=False, + columns=None, + drop_first=False, + dtype=None, +): + """ + Convert categorical variable into dummy/indicator variables. + + Each variable is converted into as many 0/1 variables as there are + different values. Columns in the output are each named after a value; + if the input is a DataFrame, the name of the original variable is + prepended to the value. 
+ + **Examples:** + >>> import bigframes.pandas as pd + >>> pd.options.display.progress_bar = None + >>> s = pd.Series(list('abca')) + >>> pd.get_dummies(s) + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + + [4 rows x 3 columns] + + >>> s1 = pd.Series(['a', 'b', None]) + >>> pd.get_dummies(s1) + a b + 0 True False + 1 False True + 2 False False + + [3 rows x 2 columns] + + >>> pd.get_dummies(s1, dummy_na=True) + a b <NA> + 0 True False False + 1 False True False + 2 False False True + + [3 rows x 3 columns] + + >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}) + >>> pd.get_dummies(df, prefix=['col1', 'col2']) + C col1_a col1_b col2_a col2_b col2_c + 0 1 True False False True False + 1 2 False True True False False + 2 3 True False False False True + + [3 rows x 6 columns] + + >>> pd.get_dummies(pd.Series(list('abcaa'))) + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + 4 True False False + + [5 rows x 3 columns] + + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) + b c + 0 False False + 1 True False + 2 False True + 3 False False + 4 False False + + [5 rows x 2 columns] + + Args: + data (Series or DataFrame): + Data of which to get dummy indicators. + + prefix (str, list of str, or dict of str, default None): + String to prepend to DataFrame column names. Pass a list with length + equal to the number of columns when calling get_dummies on a + DataFrame. Alternatively, prefix can be a dictionary mapping column + names to prefixes. + + prefix_sep (str, list of str, or dict of str, default '_'): + Separator/delimiter to use, appended to prefix. Or pass a list or + dictionary as with prefix. + + dummy_na (bool, default False): + Add a column to indicate NaNs; if False, NaNs are ignored. + + columns (list-like, default None): + Column names in the DataFrame to be encoded. 
If columns is None + then only the columns with string dtype will be converted. + + drop_first (bool, default False): + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + + dtype (dtype, default bool): + Data type for new columns. Only a single dtype is allowed. + + Returns: + DataFrame: Dummy-coded data. If data contains other columns than the + dummy-coded one(s), these will be prepended, unaltered, to the + result. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py index cc81de405b..b03f366fca 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/merge.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py @@ -16,7 +16,6 @@ def merge( sort=False, suffixes=("_x", "_y"), ): - """ Merge DataFrame objects with a database-style join. diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 24ea655a5f..d4471ed68e 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -1,6 +1,6 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/tile.py """ -Quantilization functions and related stuff +Quantilization functions and related routines """ from __future__ import annotations From a005bd1085adfb9c421fcbdfbf1cbbd269fc0e26 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Wed, 1 Nov 2023 21:14:41 +0000 Subject: [PATCH 7/8] fix: fix the failed test --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 602b17291d..d0bbda80fd 100644 --- a/noxfile.py +++ b/noxfile.py @@ -377,7 +377,7 @@ def e2e(session: nox.sessions.Session): run_system( session=session, prefix_name="e2e", - test_folder=[os.path.join("tests", 
"system", "large")], + test_folder=os.path.join("tests", "system", "large"), print_duration=True, ) From 7be45b643df45ae4990cc02c3c0100e2b872fd1e Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Wed, 1 Nov 2023 21:56:58 +0000 Subject: [PATCH 8/8] fix: reorganize the directory --- samples/snippets/{ml_samples => }/clustering_model_test.py | 0 samples/snippets/{ml_samples => }/gen_ai_model_test.py | 0 samples/snippets/{ml_samples => }/regression_model_test.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename samples/snippets/{ml_samples => }/clustering_model_test.py (100%) rename samples/snippets/{ml_samples => }/gen_ai_model_test.py (100%) rename samples/snippets/{ml_samples => }/regression_model_test.py (100%) diff --git a/samples/snippets/ml_samples/clustering_model_test.py b/samples/snippets/clustering_model_test.py similarity index 100% rename from samples/snippets/ml_samples/clustering_model_test.py rename to samples/snippets/clustering_model_test.py diff --git a/samples/snippets/ml_samples/gen_ai_model_test.py b/samples/snippets/gen_ai_model_test.py similarity index 100% rename from samples/snippets/ml_samples/gen_ai_model_test.py rename to samples/snippets/gen_ai_model_test.py diff --git a/samples/snippets/ml_samples/regression_model_test.py b/samples/snippets/regression_model_test.py similarity index 100% rename from samples/snippets/ml_samples/regression_model_test.py rename to samples/snippets/regression_model_test.py