def create_table_external_hive_partitioned(table_id: str):
    """Create an external BigQuery table over hive-partitioned Parquet files.

    Demonstrates BigQuery's hive-partitioning support: the "/dt=YYYY-MM-DD/"
    path component of the sample Parquet files on GCS is exposed by AUTO
    layout detection as a DATE column named "dt".

    Args:
        table_id: Full ID ("project.dataset.table") of the table to create.

    Returns:
        The created ``google.cloud.bigquery.Table``.
    """
    original_table_id = table_id
    # [START bigquery_create_table_external_hivepartitioned]
    # Demonstrates creating an external table with hive partitioning.

    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "your-project.your_dataset.your_table_name"

    # TODO(developer): Set source uri.
    # Example file:
    # gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/dt=2020-11-15/file1.parquet
    uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"

    # TODO(developer): Set source uri prefix.
    source_uri_prefix = (
        "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
    )

    # [END bigquery_create_table_external_hivepartitioned]
    # Restore the caller-supplied ID (the placeholder above exists only for
    # the published snippet).
    table_id = original_table_id
    # [START bigquery_create_table_external_hivepartitioned]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # Describe the external Parquet data and let BigQuery infer the schema.
    external_config = bigquery.ExternalConfig("PARQUET")
    external_config.source_uris = [uri]
    external_config.autodetect = True

    # Configure partitioning options.
    #
    # The layout of the files under the prefix is compatible with the hive
    # partitioning layout requirements, so an optional Hive partitioning
    # configuration can leverage the object paths to derive partitioning
    # column information.
    #
    # For more information on how partitions are extracted, see:
    # https://cloud.google.com/bigquery/docs/hive-partitioned-queries-gcs
    #
    # The example files carry a "/dt=YYYY-MM-DD/" path component as documented
    # above; AUTO mode surfaces it as a column named "dt" of type DATE.
    partition_options = bigquery.external_config.HivePartitioningOptions()
    partition_options.mode = "AUTO"
    partition_options.require_partition_filter = True
    partition_options.source_uri_prefix = source_uri_prefix

    external_config.hive_partitioning = partition_options

    table = bigquery.Table(table_id)
    table.external_data_configuration = external_config

    table = client.create_table(table)  # Make an API request.
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )
    # [END bigquery_create_table_external_hivepartitioned]
    return table
import create_table_external_hive_partitioned


def test_create_table_external_hive_partitioned(capsys, random_table_id):
    """End-to-end check: the sample creates a table whose hive-partitioning
    configuration matches the values hard-coded in the snippet."""
    table = create_table_external_hive_partitioned.create_table_external_hive_partitioned(
        random_table_id
    )

    out, _ = capsys.readouterr()
    # Fix: local variable was misspelled "hive_partioning".
    hive_partitioning = table.external_data_configuration.hive_partitioning
    assert "Created table {}".format(random_table_id) in out
    assert (
        hive_partitioning.source_uri_prefix
        == "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
    )
    assert hive_partitioning.require_partition_filter is True
    assert hive_partitioning.mode == "AUTO"
- hive_partitioning = bigquery.external_config.HivePartitioningOptions() - hive_partitioning.source_uri_prefix = source_uri_prefix - hive_partitioning.mode = "AUTO" - hive_partitioning.require_partition_filter = True - external_config.hive_partitioning = hive_partitioning + hive_partitioning_opts.mode = "AUTO" + hive_partitioning_opts.require_partition_filter = True + hive_partitioning_opts.source_uri_prefix = source_uri_prefix + + external_config.hive_partitioning = hive_partitioning_opts table = bigquery.Table(table_id) table.external_data_configuration = external_config