diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py new file mode 100644 index 0000000000..783f963feb --- /dev/null +++ b/samples/snippets/bqml_getting_started_test.py @@ -0,0 +1,93 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bqml_getting_started(random_model_id): + your_model_id = random_model_id + + # [START bigquery_dataframes_bqml_getting_started_tutorial] + from bigframes.ml.linear_model import LogisticRegression + import bigframes.pandas as bpd + + # Start by selecting the data you'll use for training. `read_gbq` accepts + # either a SQL query or a table ID. Since this example selects from multiple + # tables via a wildcard, use SQL to define this data. Watch issue + # https://github.com/googleapis/python-bigquery-dataframes/issues/169 + # for updates to `read_gbq` to support wildcard tables. + + df = bpd.read_gbq( + """ + -- Since the order of rows isn't useful for the model training, + -- generate a random ID to use as the index for the DataFrame. + SELECT GENERATE_UUID() AS rowindex, * + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' + """, + index_col="rowindex", + ) + + # Extract the total number of transactions within + # the Google Analytics session. + # + # Because the totals column is a STRUCT data type, call + # Series.struct.field("transactions") to extract the transactions field. + # See the reference documentation below: + # https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field + transactions = df["totals"].struct.field("transactions") + + # The "label" values represent the outcome of the model's + # prediction. In this case, the model predicts if there are any + # ecommerce transactions within the Google Analytics session. + # If the number of transactions is NULL, the value in the label + # column is set to 0. Otherwise, it is set to 1. + label = transactions.notnull().map({True: 1, False: 0}) + + # Extract the operating system of the visitor's device. + operatingSystem = df["device"].struct.field("operatingSystem") + operatingSystem = operatingSystem.fillna("") + + # Extract whether the visitor's device is a mobile device. + isMobile = df["device"].struct.field("isMobile") + + # Extract the country from which the sessions originated, based on the IP address. + country = df["geoNetwork"].struct.field("country").fillna("") + + # Extract the total number of page views within the session. + pageviews = df["totals"].struct.field("pageviews").fillna(0) + + # Combine all the feature columns into a single DataFrame + # to use as training data. + features = bpd.DataFrame( + { + "os": operatingSystem, + "is_mobile": isMobile, + "country": country, + "pageviews": pageviews, + } + ) + + # Logistic Regression model splits data into two classes, giving the + # a confidence score that the data is in one of the classes. + model = LogisticRegression() + model.fit(features, label) + + # The model.fit() call above created a temporary model. + # Use the to_gbq() method to write to a permanent location. + model.to_gbq( + your_model_id, # For example: "bqml_tutorial.sample_model", + replace=True, + ) + # [END bigquery_dataframes_bqml_getting_started_tutorial] diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py new file mode 100644 index 0000000000..1ce54b3c0c --- /dev/null +++ b/samples/snippets/conftest.py @@ -0,0 +1,66 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterator + +from google.cloud import bigquery +import pytest +import test_utils.prefixer + +prefixer = test_utils.prefixer.Prefixer( + "python-bigquery-dataframes", "samples/snippets" +) + + +@pytest.fixture(scope="session", autouse=True) +def cleanup_datasets(bigquery_client: bigquery.Client) -> None: + for dataset in bigquery_client.list_datasets(): + if prefixer.should_cleanup(dataset.dataset_id): + bigquery_client.delete_dataset( + dataset, delete_contents=True, not_found_ok=True + ) + + +@pytest.fixture(scope="session") +def bigquery_client() -> bigquery.Client: + bigquery_client = bigquery.Client() + return bigquery_client + + +@pytest.fixture(scope="session") +def project_id(bigquery_client: bigquery.Client) -> str: + return bigquery_client.project + + +@pytest.fixture(scope="session") +def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]: + dataset_id = prefixer.create_prefix() + full_dataset_id = f"{project_id}.{dataset_id}" + dataset = bigquery.Dataset(full_dataset_id) + bigquery_client.create_dataset(dataset) + yield dataset_id + bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + +@pytest.fixture +def random_model_id( + bigquery_client: bigquery.Client, project_id: str, dataset_id: str +) -> Iterator[str]: + """Create a new table ID each time, so random_model_id can be used as + target for load jobs. + """ + random_model_id = prefixer.create_prefix() + full_model_id = f"{project_id}.{dataset_id}.{random_model_id}" + yield full_model_id + bigquery_client.delete_model(full_model_id, not_found_ok=True)