From c1573f0af383ca753d2d9073a0585d56b162db1a Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 25 Oct 2023 10:28:07 -0500 Subject: [PATCH 01/23] docs: add sample for getting started with BQML --- samples/snippets/bqml_getting_started_test.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 samples/snippets/bqml_getting_started_test.py diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py new file mode 100644 index 0000000000..ccc24ea466 --- /dev/null +++ b/samples/snippets/bqml_getting_started_test.py @@ -0,0 +1,66 @@ + +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +def test_bqml_getting_started(): + + import bigframes.pandas as bpd + import bigframes + +# + df = bpd.read_gbq(''' + SELECT GENERATE_UUID() AS rowindex, * + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' + ''', + index_col='rowindex') + + # make comments + + totals = df['totals'] + + #using totals, selecting id for transaction example + totals['0000fb2c-2861-40be-9c6c-309afd7e7883'] + + transactions = totals.struct.field("transactions") + + label = transactions.notnull().map({True: 1, False: 0}) + + operatingSystem = df['device'].struct.field("operatingSystem") + + operatingSystem = operatingSystem.fillna("") + + isMobile = df['device'].struct.field("isMobile") + + country = df['geoNetwork'].struct.field("country").fillna("") + + pageviews = totals.struct.field("pageviews").fillna(0) + + features = bpd.DataFrame({ + 'os': operatingSystem, + 'is_mobile': isMobile, + 'pageviews': pageviews + }) + + # printing out the dataframe + df + + from bigframes.ml.linear_model import LogisticRegression + + model = LogisticRegression() + + model.fit(features, label) + model.to_gbq('bqml_tutorial.sample_model', replace = True) From 0b69c5744e0a2909926c2dddb102e75661df6926 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Wed, 25 Oct 2023 15:33:25 +0000 Subject: [PATCH 02/23] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- samples/snippets/bqml_getting_started_test.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index ccc24ea466..65f11de147 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -1,4 +1,3 @@ - # Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,54 +12,55 @@ # See the License for the specific language governing permissions and # limitations under the License. + def test_bqml_getting_started(): - import bigframes.pandas as bpd - import bigframes + import bigframes + import bigframes.pandas as bpd -# - df = bpd.read_gbq(''' + # + df = bpd.read_gbq( + """ SELECT GENERATE_UUID() AS rowindex, * FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` WHERE _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' - ''', - index_col='rowindex') + """, + index_col="rowindex", + ) - # make comments + # make comments - totals = df['totals'] + totals = df["totals"] - #using totals, selecting id for transaction example - totals['0000fb2c-2861-40be-9c6c-309afd7e7883'] + # using totals, selecting id for transaction example + totals["0000fb2c-2861-40be-9c6c-309afd7e7883"] transactions = totals.struct.field("transactions") label = transactions.notnull().map({True: 1, False: 0}) - operatingSystem = df['device'].struct.field("operatingSystem") + operatingSystem = df["device"].struct.field("operatingSystem") operatingSystem = operatingSystem.fillna("") - isMobile = df['device'].struct.field("isMobile") + isMobile = df["device"].struct.field("isMobile") - country = df['geoNetwork'].struct.field("country").fillna("") + country = df["geoNetwork"].struct.field("country").fillna("") pageviews = totals.struct.field("pageviews").fillna(0) - features = bpd.DataFrame({ - 'os': operatingSystem, - 'is_mobile': isMobile, - 'pageviews': pageviews - }) + features = bpd.DataFrame( + {"os": operatingSystem, "is_mobile": isMobile, "pageviews": pageviews} + ) - # printing out the dataframe - df + # printing out the dataframe + df from bigframes.ml.linear_model import LogisticRegression model = LogisticRegression() model.fit(features, label) - model.to_gbq('bqml_tutorial.sample_model', replace = True) + model.to_gbq("bqml_tutorial.sample_model", replace=True) From 4e7d81c5e8ff05eeb1a99a550c87cbf47fb9fa9b Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 25 Oct 2023 11:24:23 -0500 Subject: [PATCH 03/23] Creating clarifying comments --- samples/snippets/bqml_getting_started_test.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index ccc24ea466..5aa2665a53 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -13,12 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -def test_bqml_getting_started(): +def test_bqml_getting_started(): + #[START bigquery_getting_Started_bqml_tutorial] + #DataFrame created from a BigQuery table: import bigframes.pandas as bpd import bigframes -# + # Original sql query from tutorial, translated to Python using BigQuery BigFrames dataframes df = bpd.read_gbq(''' SELECT GENERATE_UUID() AS rowindex, * FROM @@ -28,39 +30,42 @@ def test_bqml_getting_started(): ''', index_col='rowindex') - # make comments - + #Printing dataframe, setting totals value totals = df['totals'] - #using totals, selecting id for transaction example + #Using totals, selecting id for transaction example totals['0000fb2c-2861-40be-9c6c-309afd7e7883'] - transactions = totals.struct.field("transactions") - + #Columns to indicate whether there was purchase label = transactions.notnull().map({True: 1, False: 0}) + #Operating systems of users operatingSystem = df['device'].struct.field("operatingSystem") - operatingSystem = operatingSystem.fillna("") + #Indicates whether the users devices are mobile isMobile = df['device'].struct.field("isMobile") + #Country from which the sessions originate, IP address based country = df['geoNetwork'].struct.field("country").fillna("") + #Total number of pageviews within the session pageviews = totals.struct.field("pageviews").fillna(0) + #Setting features for dataframe, features = bpd.DataFrame({ 'os': operatingSystem, 'is_mobile': isMobile, 'pageviews': pageviews }) - # printing out the dataframe + #Printing out the dataframe df - + + #Creating a logistics regression model - from bigframes.ml.linear_model import LogisticRegression - model = LogisticRegression() - + #Model training parameters, model.fit(features, label) + #Write a DataFRame to a BigQuery table- model.to_gbq('bqml_tutorial.sample_model', replace = True) From 7e2094f69ce9ba08d418fd0b8fae71acce1071ad Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 25 Oct 2023 12:29:21 -0500 Subject: [PATCH 04/23] Merging comments with this branch --- samples/snippets/bqml_getting_started_test.py | 4 ++-- samples/snippets/bqml_kmeans_clustering.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 samples/snippets/bqml_kmeans_clustering.py diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 5aa2665a53..c2de182624 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -39,7 +39,7 @@ def test_bqml_getting_started(): #Columns to indicate whether there was purchase label = transactions.notnull().map({True: 1, False: 0}) - #Operating systems of users + #Operating systems of users, extracting child fields of a struct as a Series operatingSystem = df['device'].struct.field("operatingSystem") operatingSystem = operatingSystem.fillna("") @@ -49,7 +49,7 @@ def test_bqml_getting_started(): #Country from which the sessions originate, IP address based country = df['geoNetwork'].struct.field("country").fillna("") - #Total number of pageviews within the session + #Total number of pageviews within the session, pageviews = totals.struct.field("pageviews").fillna(0) #Setting features for dataframe, diff --git a/samples/snippets/bqml_kmeans_clustering.py b/samples/snippets/bqml_kmeans_clustering.py new file mode 100644 index 0000000000..643b121d35 --- /dev/null +++ b/samples/snippets/bqml_kmeans_clustering.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file From 0fc8d092f11dc4cdfecd41d114b470abdd918716 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Wed, 25 Oct 2023 17:34:23 +0000 Subject: [PATCH 05/23] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- samples/snippets/bqml_getting_started_test.py | 2 +- samples/snippets/bqml_kmeans_clustering.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 257756637f..c7337bfe90 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -68,4 +68,4 @@ def test_bqml_getting_started(): # Model training parameters, model.fit(features, label) # Write a DataFRame to a BigQuery table- - model.to_gbq("bqml_tutorial.sample_model", replace=True) \ No newline at end of file + model.to_gbq("bqml_tutorial.sample_model", replace=True) diff --git a/samples/snippets/bqml_kmeans_clustering.py b/samples/snippets/bqml_kmeans_clustering.py index 643b121d35..1dc90d1848 100644 --- a/samples/snippets/bqml_kmeans_clustering.py +++ b/samples/snippets/bqml_kmeans_clustering.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. From 8751f50e31798bee849b2ca17351b03910785b95 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 25 Oct 2023 13:21:14 -0500 Subject: [PATCH 06/23] corrections on comments --- samples/snippets/bqml_getting_started_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index c7337bfe90..ece023b55b 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -40,7 +40,7 @@ def test_bqml_getting_started(): # Columns to indicate whether there was purchase label = transactions.notnull().map({True: 1, False: 0}) - # Operating systems of users + # Operating systems of users, extracting child field as a struct series operatingSystem = df["device"].struct.field("operatingSystem") operatingSystem = operatingSystem.fillna("") From a67068e42c09a7fbaff6a9c1e50551e80fbc77b2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 26 Oct 2023 12:19:43 -0500 Subject: [PATCH 07/23] Correcting code comments from BQ docs --- samples/snippets/bqml_getting_started_test.py | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index ece023b55b..9e1f3d9324 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -15,57 +15,58 @@ def test_bqml_getting_started(): # [START bigquery_getting_Started_bqml_tutorial] - # DataFrame created from a BigQuery table: - import bigframes import bigframes.pandas as bpd + from bigframes.ml.linear_model import LogisticRegression - # Original sql query from tutorial, translated to Python using BigQuery BigFrames dataframes + #EXPLANATION - REFERENCE GBQ DOCS! df = bpd.read_gbq( """ - SELECT GENERATE_UUID() AS rowindex, * - FROM - `bigquery-public-data.google_analytics_sample.ga_sessions_*` - WHERE - _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' - """, + SELECT GENERATE_UUID() AS rowindex, * + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' + """, index_col="rowindex", ) + + # Extract the total number of transactions within + # the Google Analytics session. + # + # Because the totals column is a STRUCT data type, we need to call + # Series.struct.field("transactions") to extract the transactions field. + # See the reference documentation below: + # https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field + transactions = df['totals'].struct.field("transactions") - # Printing dataframe, setting totals value - totals = df["totals"] - - # Using totals, selecting id for transaction example - totals["0000fb2c-2861-40be-9c6c-309afd7e7883"] - transactions = totals.struct.field("transactions") - # Columns to indicate whether there was purchase + # If the number of transactions is NULL, the value in the label + # column is set to 0. Otherwise, it is set to 1. These values + # represent the possible outcomes. label = transactions.notnull().map({True: 1, False: 0}) - # Operating systems of users, extracting child field as a struct series + # Operating systems of users, extracting child field as a struct. operatingSystem = df["device"].struct.field("operatingSystem") operatingSystem = operatingSystem.fillna("") - # Indicates whether the users devices are mobile + # Indicates whether the users devices are mobile. isMobile = df["device"].struct.field("isMobile") - # Country from which the sessions originate, IP address based + # Country from which the sessions originate, IP address based. country = df["geoNetwork"].struct.field("country").fillna("") - # Total number of pageviews within the session - pageviews = totals.struct.field("pageviews").fillna(0) + # Total number of pageviews within the session. + pageviews = df['totals'].struct.field("pageviews").fillna(0) - # Setting features for dataframe, + # Selecting values to represent data in columns in DataFrames. features = bpd.DataFrame( {"os": operatingSystem, "is_mobile": isMobile, "pageviews": pageviews} ) - # Printing out the dataframe - df - - # Creating a logistics regression model - - from bigframes.ml.linear_model import LogisticRegression - - model = LogisticRegression() - # Model training parameters, + # Logistic Regression model splits data into two classes, giving the + # probablity the data is in one of the classes. + model = LogisticRegression() model.fit(features, label) - # Write a DataFRame to a BigQuery table- - model.to_gbq("bqml_tutorial.sample_model", replace=True) + + # When writing a DataFrame to a BigQuery table, include destinaton table + # and parameters, index defaults to "True". + model.to_gbq("bqml_tutorial.sample_model", replace=True) From 9ec139e7e275e8082379022e3eb1ce06e9664c2e Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Thu, 26 Oct 2023 17:22:07 +0000 Subject: [PATCH 08/23] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- samples/snippets/bqml_getting_started_test.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 9e1f3d9324..3870418685 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -15,10 +15,10 @@ def test_bqml_getting_started(): # [START bigquery_getting_Started_bqml_tutorial] - import bigframes.pandas as bpd from bigframes.ml.linear_model import LogisticRegression + import bigframes.pandas as bpd - #EXPLANATION - REFERENCE GBQ DOCS! + # EXPLANATION - REFERENCE GBQ DOCS! df = bpd.read_gbq( """ SELECT GENERATE_UUID() AS rowindex, * @@ -29,18 +29,18 @@ def test_bqml_getting_started(): """, index_col="rowindex", ) - + # Extract the total number of transactions within # the Google Analytics session. # # Because the totals column is a STRUCT data type, we need to call - # Series.struct.field("transactions") to extract the transactions field. - # See the reference documentation below: + # Series.struct.field("transactions") to extract the transactions field. + # See the reference documentation below: # https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field - transactions = df['totals'].struct.field("transactions") + transactions = df["totals"].struct.field("transactions") - # If the number of transactions is NULL, the value in the label - # column is set to 0. Otherwise, it is set to 1. These values + # If the number of transactions is NULL, the value in the label + # column is set to 0. Otherwise, it is set to 1. These values # represent the possible outcomes. label = transactions.notnull().map({True: 1, False: 0}) @@ -55,7 +55,7 @@ def test_bqml_getting_started(): country = df["geoNetwork"].struct.field("country").fillna("") # Total number of pageviews within the session. - pageviews = df['totals'].struct.field("pageviews").fillna(0) + pageviews = df["totals"].struct.field("pageviews").fillna(0) # Selecting values to represent data in columns in DataFrames. features = bpd.DataFrame( @@ -63,10 +63,10 @@ def test_bqml_getting_started(): ) # Logistic Regression model splits data into two classes, giving the - # probablity the data is in one of the classes. - model = LogisticRegression() + # probablity the data is in one of the classes. + model = LogisticRegression() model.fit(features, label) - # When writing a DataFrame to a BigQuery table, include destinaton table - # and parameters, index defaults to "True". - model.to_gbq("bqml_tutorial.sample_model", replace=True) + # When writing a DataFrame to a BigQuery table, include destinaton table + # and parameters, index defaults to "True". + model.to_gbq("bqml_tutorial.sample_model", replace=True) From ec651b176c41e51d951674d86da03ea9fcf7b709 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Nov 2023 12:16:03 -0500 Subject: [PATCH 09/23] Fixing code comments to reflect BQML documentation --- samples/snippets/bqml_getting_started_test.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 9e1f3d9324..e6865b538f 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -14,12 +14,17 @@ def test_bqml_getting_started(): - # [START bigquery_getting_Started_bqml_tutorial] + # [start bigquery_getting_Started_bqml_tutorial] import bigframes.pandas as bpd from bigframes.ml.linear_model import LogisticRegression - #EXPLANATION - REFERENCE GBQ DOCS! + # Read_gbq loads a DataFrame from BiqQuery and gives an unordered, + # unindexed data source. The default DataFrame will have an arbitary + # index and ordering. + df = bpd.read_gbq( + # Generate_UUID produces a random universally uniquee identifier + # as a STRING value. """ SELECT GENERATE_UUID() AS rowindex, * FROM @@ -44,17 +49,17 @@ def test_bqml_getting_started(): # represent the possible outcomes. label = transactions.notnull().map({True: 1, False: 0}) - # Operating systems of users, extracting child field as a struct. + # Choosing the operating system of the users devices. operatingSystem = df["device"].struct.field("operatingSystem") operatingSystem = operatingSystem.fillna("") - # Indicates whether the users devices are mobile. + # Extract whether the visitor's device is a mobile device. isMobile = df["device"].struct.field("isMobile") - # Country from which the sessions originate, IP address based. + # Extract where the visitors country of origin is. country = df["geoNetwork"].struct.field("country").fillna("") - # Total number of pageviews within the session. + # Extract the total pageviews from the totals column. pageviews = df['totals'].struct.field("pageviews").fillna(0) # Selecting values to represent data in columns in DataFrames. @@ -62,11 +67,12 @@ def test_bqml_getting_started(): {"os": operatingSystem, "is_mobile": isMobile, "pageviews": pageviews} ) - # Logistic Regression model splits data into two classes, giving the + # Logistic Regression model splits data into two classes,giving the # probablity the data is in one of the classes. model = LogisticRegression() model.fit(features, label) - # When writing a DataFrame to a BigQuery table, include destinaton table - # and parameters, index defaults to "True". + # + # model.to_gbq("bqml_tutorial.sample_model", replace=True) + # [END bigquery_getting_started_bqml_tutorial] From fbbe32bcda50f5915729844ae2cd36180a9e1dbf Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 6 Nov 2023 12:12:56 -0600 Subject: [PATCH 10/23] Correcting code comments --- samples/snippets/bqml_getting_started_test.py | 12 +++++++++--- samples/snippets/bqml_kmeans_clustering.py | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index f0b14c7df9..fb519b4b89 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -15,7 +15,7 @@ def test_bqml_getting_started(): <<<<<<< HEAD - # [start bigquery_getting_Started_bqml_tutorial] + # [START bigquery_getting_started_bqml_tutorial] import bigframes.pandas as bpd ======= # [START bigquery_getting_Started_bqml_tutorial] @@ -32,8 +32,14 @@ def test_bqml_getting_started(): # EXPLANATION - REFERENCE GBQ DOCS! >>>>>>> 9ec139e7e275e8082379022e3eb1ce06e9664c2e df = bpd.read_gbq( - # Generate_UUID produces a random universally uniquee identifier - # as a STRING value. + # Start by selecting the data you'll use for training. `read_gbq` accepts + # either a SQL query or a table ID. Since this example selects from multiple + # tables via a wildcard, use SQL to define this data. Watch issue + # https://github.com/googleapis/python-bigquery-dataframes/issues/169 + # for updates to `read_gbq` to support wildcard tables. + # + # https://github.com/googleapis/python-bigquery-dataframes/issues/169 + """ SELECT GENERATE_UUID() AS rowindex, * FROM diff --git a/samples/snippets/bqml_kmeans_clustering.py b/samples/snippets/bqml_kmeans_clustering.py index 1dc90d1848..a8d5df6fd4 100644 --- a/samples/snippets/bqml_kmeans_clustering.py +++ b/samples/snippets/bqml_kmeans_clustering.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + From a2b7f2ffbb2154b86a85fda796fd66af699f5f8c Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 7 Nov 2023 10:48:39 -0600 Subject: [PATCH 11/23] Correcting documentation code --- samples/snippets/bqml_kmeans_clustering.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/samples/snippets/bqml_kmeans_clustering.py b/samples/snippets/bqml_kmeans_clustering.py index a8d5df6fd4..1dc90d1848 100644 --- a/samples/snippets/bqml_kmeans_clustering.py +++ b/samples/snippets/bqml_kmeans_clustering.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - From c899565c62f50bfaeb7530dc69ceac03b398bb60 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 7 Nov 2023 10:56:03 -0600 Subject: [PATCH 12/23] Correcting documentation errors --- samples/snippets/bqml_getting_started_test.py | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index fb519b4b89..1823e7c90e 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -14,23 +14,11 @@ def test_bqml_getting_started(): -<<<<<<< HEAD - # [START bigquery_getting_started_bqml_tutorial] - import bigframes.pandas as bpd -======= # [START bigquery_getting_Started_bqml_tutorial] ->>>>>>> 9ec139e7e275e8082379022e3eb1ce06e9664c2e from bigframes.ml.linear_model import LogisticRegression import bigframes.pandas as bpd -<<<<<<< HEAD - # Read_gbq loads a DataFrame from BiqQuery and gives an unordered, - # unindexed data source. The default DataFrame will have an arbitary - # index and ordering. - -======= # EXPLANATION - REFERENCE GBQ DOCS! ->>>>>>> 9ec139e7e275e8082379022e3eb1ce06e9664c2e df = bpd.read_gbq( # Start by selecting the data you'll use for training. `read_gbq` accepts # either a SQL query or a table ID. Since this example selects from multiple @@ -74,30 +62,14 @@ def test_bqml_getting_started(): # Extract where the visitors country of origin is. country = df["geoNetwork"].struct.field("country").fillna("") -<<<<<<< HEAD - # Extract the total pageviews from the totals column. - pageviews = df['totals'].struct.field("pageviews").fillna(0) -======= # Total number of pageviews within the session. pageviews = df["totals"].struct.field("pageviews").fillna(0) ->>>>>>> 9ec139e7e275e8082379022e3eb1ce06e9664c2e # Selecting values to represent data in columns in DataFrames. features = bpd.DataFrame( {"os": operatingSystem, "is_mobile": isMobile, "pageviews": pageviews} ) -<<<<<<< HEAD - # Logistic Regression model splits data into two classes,giving the - # probablity the data is in one of the classes. - model = LogisticRegression() - model.fit(features, label) - - # - # - model.to_gbq("bqml_tutorial.sample_model", replace=True) - # [END bigquery_getting_started_bqml_tutorial] -======= # Logistic Regression model splits data into two classes, giving the # probablity the data is in one of the classes. model = LogisticRegression() @@ -106,4 +78,3 @@ def test_bqml_getting_started(): # When writing a DataFrame to a BigQuery table, include destinaton table # and parameters, index defaults to "True". model.to_gbq("bqml_tutorial.sample_model", replace=True) ->>>>>>> 9ec139e7e275e8082379022e3eb1ce06e9664c2e From 83644540d580b5b50866592d8a32e07e2594e6af Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Tue, 7 Nov 2023 16:58:06 +0000 Subject: [PATCH 13/23] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- samples/snippets/bqml_getting_started_test.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 1823e7c90e..a8c8777016 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -20,14 +20,13 @@ def test_bqml_getting_started(): # EXPLANATION - REFERENCE GBQ DOCS! df = bpd.read_gbq( - # Start by selecting the data you'll use for training. `read_gbq` accepts - # either a SQL query or a table ID. Since this example selects from multiple - # tables via a wildcard, use SQL to define this data. Watch issue - # https://github.com/googleapis/python-bigquery-dataframes/issues/169 - # for updates to `read_gbq` to support wildcard tables. - # - # https://github.com/googleapis/python-bigquery-dataframes/issues/169 - + # Start by selecting the data you'll use for training. `read_gbq` accepts + # either a SQL query or a table ID. Since this example selects from multiple + # tables via a wildcard, use SQL to define this data. Watch issue + # https://github.com/googleapis/python-bigquery-dataframes/issues/169 + # for updates to `read_gbq` to support wildcard tables. + # + # https://github.com/googleapis/python-bigquery-dataframes/issues/169 """ SELECT GENERATE_UUID() AS rowindex, * FROM @@ -59,7 +58,7 @@ def test_bqml_getting_started(): # Extract whether the visitor's device is a mobile device. isMobile = df["device"].struct.field("isMobile") - # Extract where the visitors country of origin is. + # Extract where the visitors country of origin is. country = df["geoNetwork"].struct.field("country").fillna("") # Total number of pageviews within the session. From 509c1f4013e7e358096dbea8d4056fe64584532f Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 7 Nov 2023 13:30:55 -0600 Subject: [PATCH 14/23] Correcting documentation comments and correcting features --- samples/snippets/bqml_getting_started_test.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index a8c8777016..5028028283 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -17,16 +17,16 @@ def test_bqml_getting_started(): # [START bigquery_getting_Started_bqml_tutorial] from bigframes.ml.linear_model import LogisticRegression import bigframes.pandas as bpd - - # EXPLANATION - REFERENCE GBQ DOCS! + + # Start by selecting the data you'll use for training. `read_gbq` accepts + # either a SQL query or a table ID. Since this example selects from multiple + # tables via a wildcard, use SQL to define this data. Watch issue + # https://github.com/googleapis/python-bigquery-dataframes/issues/169 + # for updates to `read_gbq` to support wildcard tables. + # + # https://github.com/googleapis/python-bigquery-dataframes/issues/169 + df = bpd.read_gbq( - # Start by selecting the data you'll use for training. `read_gbq` accepts - # either a SQL query or a table ID. Since this example selects from multiple - # tables via a wildcard, use SQL to define this data. Watch issue - # https://github.com/googleapis/python-bigquery-dataframes/issues/169 - # for updates to `read_gbq` to support wildcard tables. - # - # https://github.com/googleapis/python-bigquery-dataframes/issues/169 """ SELECT GENERATE_UUID() AS rowindex, * FROM @@ -66,7 +66,7 @@ def test_bqml_getting_started(): # Selecting values to represent data in columns in DataFrames. features = bpd.DataFrame( - {"os": operatingSystem, "is_mobile": isMobile, "pageviews": pageviews} + {"os": operatingSystem, "is_mobile": isMobile, "country": country, "pageviews": pageviews} ) # Logistic Regression model splits data into two classes, giving the @@ -77,3 +77,4 @@ def test_bqml_getting_started(): # When writing a DataFrame to a BigQuery table, include destinaton table # and parameters, index defaults to "True". model.to_gbq("bqml_tutorial.sample_model", replace=True) + # [END bigquery_getting_Started_bqml_tutorial] From 16d4f184c8099ce1fdf42da3300244c4c9f9b94b Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Tue, 7 Nov 2023 19:32:53 +0000 Subject: [PATCH 15/23] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- samples/snippets/bqml_getting_started_test.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 5028028283..10a4ca3961 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -17,7 +17,7 @@ def test_bqml_getting_started(): # [START bigquery_getting_Started_bqml_tutorial] from bigframes.ml.linear_model import LogisticRegression import bigframes.pandas as bpd - + # Start by selecting the data you'll use for training. `read_gbq` accepts # either a SQL query or a table ID. Since this example selects from multiple # tables via a wildcard, use SQL to define this data. Watch issue @@ -25,7 +25,7 @@ def test_bqml_getting_started(): # for updates to `read_gbq` to support wildcard tables. # # https://github.com/googleapis/python-bigquery-dataframes/issues/169 - + df = bpd.read_gbq( """ SELECT GENERATE_UUID() AS rowindex, * @@ -66,7 +66,12 @@ def test_bqml_getting_started(): # Selecting values to represent data in columns in DataFrames. features = bpd.DataFrame( - {"os": operatingSystem, "is_mobile": isMobile, "country": country, "pageviews": pageviews} + { + "os": operatingSystem, + "is_mobile": isMobile, + "country": country, + "pageviews": pageviews, + } ) # Logistic Regression model splits data into two classes, giving the From 34eb65af4c476c5eb904f4640f93f47f6f14785f Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 7 Nov 2023 14:29:10 -0600 Subject: [PATCH 16/23] Correcting documention comments for code samples --- samples/snippets/bqml_getting_started_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 5028028283..df22ba38a7 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -14,7 +14,7 @@ def test_bqml_getting_started(): - # [START bigquery_getting_Started_bqml_tutorial] + # [START bigquery_getting_started_bqml_tutorial] from bigframes.ml.linear_model import LogisticRegression import bigframes.pandas as bpd From 16c6fb098f61a546f55fed51d801e5f10a180e21 Mon Sep 17 00:00:00 2001 From: Stephanie A <129541811+DevStephanie@users.noreply.github.com> Date: Fri, 10 Nov 2023 14:17:01 -0600 Subject: [PATCH 17/23] Apply suggestions from code review Correcting documentation comments Co-authored-by: Tim Swast --- samples/snippets/bqml_getting_started_test.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 10a4ca3961..d9a6ff885c 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -14,7 +14,7 @@ def test_bqml_getting_started(): - # [START bigquery_getting_Started_bqml_tutorial] + # [START bigquery_getting_started_bqml_tutorial] from bigframes.ml.linear_model import LogisticRegression import bigframes.pandas as bpd @@ -23,11 +23,11 @@ def test_bqml_getting_started(): # tables via a wildcard, use SQL to define this data. Watch issue # https://github.com/googleapis/python-bigquery-dataframes/issues/169 # for updates to `read_gbq` to support wildcard tables. - # - # https://github.com/googleapis/python-bigquery-dataframes/issues/169 df = bpd.read_gbq( """ + -- Since the order of rows isn't useful for the model training, + -- generate a random ID to use as the index for the DataFrame. SELECT GENERATE_UUID() AS rowindex, * FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` @@ -46,25 +46,28 @@ def test_bqml_getting_started(): # https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field transactions = df["totals"].struct.field("transactions") + # The "label" values represent the outcome of the model's + # prediction. In this case, the model predicts if there are any + # ecommerce transactions within the Google Analytics session. # If the number of transactions is NULL, the value in the label - # column is set to 0. Otherwise, it is set to 1. These values - # represent the possible outcomes. + # column is set to 0. Otherwise, it is set to 1. label = transactions.notnull().map({True: 1, False: 0}) - # Choosing the operating system of the users devices. + # Extract the operating system of the visitor's device. operatingSystem = df["device"].struct.field("operatingSystem") operatingSystem = operatingSystem.fillna("") # Extract whether the visitor's device is a mobile device. isMobile = df["device"].struct.field("isMobile") - # Extract where the visitors country of origin is. + # Extract the country from which the sessions originated, based on the IP address. country = df["geoNetwork"].struct.field("country").fillna("") - # Total number of pageviews within the session. + # Extract the total number of page views within the session. pageviews = df["totals"].struct.field("pageviews").fillna(0) - # Selecting values to represent data in columns in DataFrames. + # Combine all the feature columns into a single DataFrame + # to use as training data. features = bpd.DataFrame( { "os": operatingSystem, @@ -79,7 +82,7 @@ def test_bqml_getting_started(): model = LogisticRegression() model.fit(features, label) - # When writing a DataFrame to a BigQuery table, include destinaton table - # and parameters, index defaults to "True". + # The model.fit() call above created a temporary model. + # Use the to_gbq() method to write to a permanent location. model.to_gbq("bqml_tutorial.sample_model", replace=True) - # [END bigquery_getting_Started_bqml_tutorial] + # [END bigquery_getting_started_bqml_tutorial] From 77c22b9cfa21ba875ba835e6c68f0e49b4303723 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 13 Nov 2023 12:41:37 -0600 Subject: [PATCH 18/23] Correcting documentation comments --- samples/snippets/bqml_getting_started_test.py | 30 ++++++++++--------- samples/snippets/bqml_kmeans_clustering.py | 13 -------- 2 files changed, 16 insertions(+), 27 deletions(-) delete mode 100644 samples/snippets/bqml_kmeans_clustering.py diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index f67ee55711..03485bba24 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -26,16 +26,14 @@ def test_bqml_getting_started(): # # https://github.com/googleapis/python-bigquery-dataframes/issues/169 - df = bpd.read_gbq( - """ - SELECT GENERATE_UUID() AS rowindex, * - FROM - `bigquery-public-data.google_analytics_sample.ga_sessions_*` - WHERE - _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' - """, - index_col="rowindex", - ) + df = bpd.read_gbq(''' + SELECT GENERATE_UUID() AS rowindex, * + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' + ''', + index_col='rowindex') # Extract the total number of transactions within # the Google Analytics session. @@ -46,12 +44,15 @@ def test_bqml_getting_started(): # https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field transactions = df["totals"].struct.field("transactions") + # The "label" values represent the outcome of the model's + # prediction. In this case, the model predicts if there are any + # ecommerce transactions within the Google Analytics session. # If the number of transactions is NULL, the value in the label # column is set to 0. Otherwise, it is set to 1. These values # represent the possible outcomes. label = transactions.notnull().map({True: 1, False: 0}) - # Choosing the operating system of the users devices. + # Extract the operating system of the visitor's device. operatingSystem = df["device"].struct.field("operatingSystem") operatingSystem = operatingSystem.fillna("") @@ -61,10 +62,11 @@ def test_bqml_getting_started(): # Extract where the visitors country of origin is. country = df["geoNetwork"].struct.field("country").fillna("") - # Total number of pageviews within the session. + # Extract the total number of page views within the session. pageviews = df["totals"].struct.field("pageviews").fillna(0) - # Selecting values to represent data in columns in DataFrames. + # Combine all the feature columns into a single DataFrame + # to use as training data features = bpd.DataFrame( { "os": operatingSystem, @@ -82,4 +84,4 @@ def test_bqml_getting_started(): # When writing a DataFrame to a BigQuery table, include destinaton table # and parameters, index defaults to "True". model.to_gbq("bqml_tutorial.sample_model", replace=True) - # [END bigquery_getting_Started_bqml_tutorial] + # [END bigquery_getting_started_bqml_tutorial] diff --git a/samples/snippets/bqml_kmeans_clustering.py b/samples/snippets/bqml_kmeans_clustering.py deleted file mode 100644 index 1dc90d1848..0000000000 --- a/samples/snippets/bqml_kmeans_clustering.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. From 93f911dcd834346a9e0bc9becb1664f5b954dc28 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 13 Nov 2023 12:47:30 -0600 Subject: [PATCH 19/23] Correcting documentation comments --- samples/snippets/bqml_getting_started_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index d9a6ff885c..b8502eee42 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -26,8 +26,6 @@ def test_bqml_getting_started(): df = bpd.read_gbq( """ - -- Since the order of rows isn't useful for the model training, - -- generate a random ID to use as the index for the DataFrame. SELECT GENERATE_UUID() AS rowindex, * FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` From f3aee5dd88f9c9fa75136084a5f2dceb52c198a0 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 16 Nov 2023 16:38:12 -0600 Subject: [PATCH 20/23] Apply suggestions from code review --- samples/snippets/bqml_getting_started_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index b8502eee42..d9a6ff885c 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -26,6 +26,8 @@ def test_bqml_getting_started(): df = bpd.read_gbq( """ + -- Since the order of rows isn't useful for the model training, + -- generate a random ID to use as the index for the DataFrame. SELECT GENERATE_UUID() AS rowindex, * FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` From 1ac855da4e6074aa7e47110685d904168b30455e Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 16 Nov 2023 16:46:40 -0600 Subject: [PATCH 21/23] Apply suggestions from code review --- samples/snippets/bqml_getting_started_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index d9a6ff885c..78c5f49cb4 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -40,7 +40,7 @@ def test_bqml_getting_started(): # Extract the total number of transactions within # the Google Analytics session. # - # Because the totals column is a STRUCT data type, we need to call + # Because the totals column is a STRUCT data type, call # Series.struct.field("transactions") to extract the transactions field. # See the reference documentation below: # https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field @@ -78,7 +78,7 @@ def test_bqml_getting_started(): ) # Logistic Regression model splits data into two classes, giving the - # probablity the data is in one of the classes. + # a confidence score that the data is in one of the classes. model = LogisticRegression() model.fit(features, label) From 5494f46502908b0be3ebe3a61381936e00fab861 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 12 Dec 2023 14:32:07 -0600 Subject: [PATCH 22/23] Fixtures for temporary resources --- samples/snippets/bqml_getting_started_test.py | 13 ++- samples/snippets/conftest.py | 66 +++++++++++++ third_party/geopandas/LICENSE.txt | 25 +++++ third_party/geopandas/README.md | 99 +++++++++++++++++++ 4 files changed, 199 insertions(+), 4 deletions(-) create mode 100644 samples/snippets/conftest.py create mode 100644 third_party/geopandas/LICENSE.txt create mode 100644 third_party/geopandas/README.md diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index b8502eee42..3c6361ea37 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -13,8 +13,10 @@ # limitations under the License. -def test_bqml_getting_started(): - # [START bigquery_getting_started_bqml_tutorial] +def test_bqml_getting_started(random_model_id): + your_model_id = random_model_id + + # [START bigquery_dataframes_bqml_getting_started_tutorial] from bigframes.ml.linear_model import LogisticRegression import bigframes.pandas as bpd @@ -82,5 +84,8 @@ def test_bqml_getting_started(): # The model.fit() call above created a temporary model. # Use the to_gbq() method to write to a permanent location. - model.to_gbq("bqml_tutorial.sample_model", replace=True) - # [END bigquery_getting_started_bqml_tutorial] + model.to_gbq( + your_model_id, # For example: "bqml_tutorial.sample_model", + replace=True, + ) + # [END bigquery_dataframes_bqml_getting_started_tutorial] diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py new file mode 100644 index 0000000000..1ce54b3c0c --- /dev/null +++ b/samples/snippets/conftest.py @@ -0,0 +1,66 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterator + +from google.cloud import bigquery +import pytest +import test_utils.prefixer + +prefixer = test_utils.prefixer.Prefixer( + "python-bigquery-dataframes", "samples/snippets" +) + + +@pytest.fixture(scope="session", autouse=True) +def cleanup_datasets(bigquery_client: bigquery.Client) -> None: + for dataset in bigquery_client.list_datasets(): + if prefixer.should_cleanup(dataset.dataset_id): + bigquery_client.delete_dataset( + dataset, delete_contents=True, not_found_ok=True + ) + + +@pytest.fixture(scope="session") +def bigquery_client() -> bigquery.Client: + bigquery_client = bigquery.Client() + return bigquery_client + + +@pytest.fixture(scope="session") +def project_id(bigquery_client: bigquery.Client) -> str: + return bigquery_client.project + + +@pytest.fixture(scope="session") +def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]: + dataset_id = prefixer.create_prefix() + full_dataset_id = f"{project_id}.{dataset_id}" + dataset = bigquery.Dataset(full_dataset_id) + bigquery_client.create_dataset(dataset) + yield dataset_id + bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + +@pytest.fixture +def random_model_id( + bigquery_client: bigquery.Client, project_id: str, dataset_id: str +) -> Iterator[str]: + """Create a new table ID each time, so random_model_id can be used as + target for load jobs. + """ + random_model_id = prefixer.create_prefix() + full_model_id = f"{project_id}.{dataset_id}.{random_model_id}" + yield full_model_id + bigquery_client.delete_model(full_model_id, not_found_ok=True) diff --git a/third_party/geopandas/LICENSE.txt b/third_party/geopandas/LICENSE.txt new file mode 100644 index 0000000000..e17722b2e6 --- /dev/null +++ b/third_party/geopandas/LICENSE.txt @@ -0,0 +1,25 @@ +Copyright (c) 2013-2022, GeoPandas developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of GeoPandas nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/third_party/geopandas/README.md b/third_party/geopandas/README.md new file mode 100644 index 0000000000..46edd2bbf1 --- /dev/null +++ b/third_party/geopandas/README.md @@ -0,0 +1,99 @@ +pypi Actions Status Coverage Status Join the chat at https://gitter.im/geopandas/geopandas Binder DOI Powered by NumFOCUS + +GeoPandas +Python tools for geographic data + +Introduction +GeoPandas is a project to add support for geographic data to pandas objects. It currently implements GeoSeries and GeoDataFrame types which are subclasses of pandas.Series and pandas.DataFrame respectively. GeoPandas objects can act on shapely geometry objects and perform geometric operations. + +GeoPandas geometry operations are cartesian. The coordinate reference system (crs) can be stored as an attribute on an object, and is automatically set when loading from a file. Objects may be transformed to new coordinate systems with the to_crs() method. There is currently no enforcement of like coordinates for operations, but that may change in the future. + +Documentation is available at geopandas.org (current release) and Read the Docs (release and development versions). + +The GeoPandas project uses an open governance model and is fiscally sponsored by NumFOCUS. Consider making a tax-deductible donation to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs. + + + +Install +See the installation docs for all details. GeoPandas depends on the following packages: + +pandas +shapely +fiona +pyproj +packaging +Further, matplotlib is an optional dependency, required for plotting. Those packages depend on several low-level libraries for geospatial analysis, which can be a challenge to install. Therefore, we recommend to install GeoPandas using the conda package manager. See the installation docs for more details. + +Get in touch +Ask usage questions ("How do I?") on StackOverflow or GIS StackExchange. +Get involved in discussions on GitHub +Report bugs, suggest features or view the source code on GitHub. +For a quick question about a bug report or feature request, or Pull Request, head over to the gitter channel. +For less well defined questions or ideas, or to announce other projects of interest to GeoPandas users, ... use the mailing list. +Examples +>>> import geopandas +>>> from shapely.geometry import Polygon +>>> p1 = Polygon([(0, 0), (1, 0), (1, 1)]) +>>> p2 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]) +>>> p3 = Polygon([(2, 0), (3, 0), (3, 1), (2, 1)]) +>>> g = geopandas.GeoSeries([p1, p2, p3]) +>>> g +0 POLYGON ((0 0, 1 0, 1 1, 0 0)) +1 POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0)) +2 POLYGON ((2 0, 3 0, 3 1, 2 1, 2 0)) +dtype: geometry +Example 1 + +Some geographic operations return normal pandas objects. The area property of a GeoSeries will return a pandas.Series containing the area of each item in the GeoSeries: + +>>> print(g.area) +0 0.5 +1 1.0 +2 1.0 +dtype: float64 +Other operations return GeoPandas objects: + +>>> g.buffer(0.5) +0 POLYGON ((-0.3535533905932737 0.35355339059327... +1 POLYGON ((-0.5 0, -0.5 1, -0.4975923633360985 ... +2 POLYGON ((1.5 0, 1.5 1, 1.502407636663901 1.04... +dtype: geometry +Example 2 + +GeoPandas objects also know how to plot themselves. GeoPandas uses matplotlib for plotting. To generate a plot of a GeoSeries, use: + +>>> g.plot() +GeoPandas also implements alternate constructors that can read any data format recognized by fiona. To read a zip file containing an ESRI shapefile with the boroughs boundaries of New York City (the example can be fetched using the geodatasets package): + +>>> import geodatasets +>>> nybb_path = geodatasets.get_path('nybb') +>>> boros = geopandas.read_file(nybb_path) +>>> boros.set_index('BoroCode', inplace=True) +>>> boros.sort_index(inplace=True) +>>> boros + BoroName Shape_Leng Shape_Area \ +BoroCode +1 Manhattan 359299.096471 6.364715e+08 +2 Bronx 464392.991824 1.186925e+09 +3 Brooklyn 741080.523166 1.937479e+09 +4 Queens 896344.047763 3.045213e+09 +5 Staten Island 330470.010332 1.623820e+09 + + geometry +BoroCode +1 MULTIPOLYGON (((981219.0557861328 188655.31579... +2 MULTIPOLYGON (((1012821.805786133 229228.26458... +3 MULTIPOLYGON (((1021176.479003906 151374.79699... +4 MULTIPOLYGON (((1029606.076599121 156073.81420... +5 MULTIPOLYGON (((970217.0223999023 145643.33221... +New York City boroughs + +>>> boros['geometry'].convex_hull +BoroCode +1 POLYGON ((977855.4451904297 188082.3223876953,... +2 POLYGON ((1017949.977600098 225426.8845825195,... +3 POLYGON ((988872.8212280273 146772.0317993164,... +4 POLYGON ((1000721.531799316 136681.776184082, ... +5 POLYGON ((915517.6877458114 120121.8812543372,... +dtype: geometry +Convex hulls of New York City boroughs \ No newline at end of file From 7a0429914f8509eb3a18e1f1b9c45143cddb42be Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 12 Dec 2023 14:36:15 -0600 Subject: [PATCH 23/23] Deleting files --- third_party/geopandas/LICENSE.txt | 25 -------- third_party/geopandas/README.md | 99 ------------------------------- 2 files changed, 124 deletions(-) delete mode 100644 third_party/geopandas/LICENSE.txt delete mode 100644 third_party/geopandas/README.md diff --git a/third_party/geopandas/LICENSE.txt b/third_party/geopandas/LICENSE.txt deleted file mode 100644 index e17722b2e6..0000000000 --- a/third_party/geopandas/LICENSE.txt +++ /dev/null @@ -1,25 +0,0 @@ -Copyright (c) 2013-2022, GeoPandas developers. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of GeoPandas nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/third_party/geopandas/README.md b/third_party/geopandas/README.md deleted file mode 100644 index 46edd2bbf1..0000000000 --- a/third_party/geopandas/README.md +++ /dev/null @@ -1,99 +0,0 @@ -pypi Actions Status Coverage Status Join the chat at https://gitter.im/geopandas/geopandas Binder DOI Powered by NumFOCUS - -GeoPandas -Python tools for geographic data - -Introduction -GeoPandas is a project to add support for geographic data to pandas objects. It currently implements GeoSeries and GeoDataFrame types which are subclasses of pandas.Series and pandas.DataFrame respectively. GeoPandas objects can act on shapely geometry objects and perform geometric operations. - -GeoPandas geometry operations are cartesian. The coordinate reference system (crs) can be stored as an attribute on an object, and is automatically set when loading from a file. Objects may be transformed to new coordinate systems with the to_crs() method. There is currently no enforcement of like coordinates for operations, but that may change in the future. - -Documentation is available at geopandas.org (current release) and Read the Docs (release and development versions). - -The GeoPandas project uses an open governance model and is fiscally sponsored by NumFOCUS. Consider making a tax-deductible donation to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs. - - - -Install -See the installation docs for all details. GeoPandas depends on the following packages: - -pandas -shapely -fiona -pyproj -packaging -Further, matplotlib is an optional dependency, required for plotting. Those packages depend on several low-level libraries for geospatial analysis, which can be a challenge to install. Therefore, we recommend to install GeoPandas using the conda package manager. See the installation docs for more details. - -Get in touch -Ask usage questions ("How do I?") on StackOverflow or GIS StackExchange. -Get involved in discussions on GitHub -Report bugs, suggest features or view the source code on GitHub. -For a quick question about a bug report or feature request, or Pull Request, head over to the gitter channel. -For less well defined questions or ideas, or to announce other projects of interest to GeoPandas users, ... use the mailing list. -Examples ->>> import geopandas ->>> from shapely.geometry import Polygon ->>> p1 = Polygon([(0, 0), (1, 0), (1, 1)]) ->>> p2 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]) ->>> p3 = Polygon([(2, 0), (3, 0), (3, 1), (2, 1)]) ->>> g = geopandas.GeoSeries([p1, p2, p3]) ->>> g -0 POLYGON ((0 0, 1 0, 1 1, 0 0)) -1 POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0)) -2 POLYGON ((2 0, 3 0, 3 1, 2 1, 2 0)) -dtype: geometry -Example 1 - -Some geographic operations return normal pandas objects. The area property of a GeoSeries will return a pandas.Series containing the area of each item in the GeoSeries: - ->>> print(g.area) -0 0.5 -1 1.0 -2 1.0 -dtype: float64 -Other operations return GeoPandas objects: - ->>> g.buffer(0.5) -0 POLYGON ((-0.3535533905932737 0.35355339059327... -1 POLYGON ((-0.5 0, -0.5 1, -0.4975923633360985 ... -2 POLYGON ((1.5 0, 1.5 1, 1.502407636663901 1.04... -dtype: geometry -Example 2 - -GeoPandas objects also know how to plot themselves. GeoPandas uses matplotlib for plotting. To generate a plot of a GeoSeries, use: - ->>> g.plot() -GeoPandas also implements alternate constructors that can read any data format recognized by fiona. To read a zip file containing an ESRI shapefile with the boroughs boundaries of New York City (the example can be fetched using the geodatasets package): - ->>> import geodatasets ->>> nybb_path = geodatasets.get_path('nybb') ->>> boros = geopandas.read_file(nybb_path) ->>> boros.set_index('BoroCode', inplace=True) ->>> boros.sort_index(inplace=True) ->>> boros - BoroName Shape_Leng Shape_Area \ -BoroCode -1 Manhattan 359299.096471 6.364715e+08 -2 Bronx 464392.991824 1.186925e+09 -3 Brooklyn 741080.523166 1.937479e+09 -4 Queens 896344.047763 3.045213e+09 -5 Staten Island 330470.010332 1.623820e+09 - - geometry -BoroCode -1 MULTIPOLYGON (((981219.0557861328 188655.31579... -2 MULTIPOLYGON (((1012821.805786133 229228.26458... -3 MULTIPOLYGON (((1021176.479003906 151374.79699... -4 MULTIPOLYGON (((1029606.076599121 156073.81420... -5 MULTIPOLYGON (((970217.0223999023 145643.33221... -New York City boroughs - ->>> boros['geometry'].convex_hull -BoroCode -1 POLYGON ((977855.4451904297 188082.3223876953,... -2 POLYGON ((1017949.977600098 225426.8845825195,... -3 POLYGON ((988872.8212280273 146772.0317993164,... -4 POLYGON ((1000721.531799316 136681.776184082, ... -5 POLYGON ((915517.6877458114 120121.8812543372,... -dtype: geometry -Convex hulls of New York City boroughs \ No newline at end of file