From bab93e0c18481635becabcb9f03b368c3236acbd Mon Sep 17 00:00:00 2001 From: avihay <2963806+grooveygr@users.noreply.github.com> Date: Thu, 6 Feb 2020 11:19:14 +0200 Subject: [PATCH 1/3] perf(bigquery): remove redundant array deepcopy deepcopy can be a very costly operation when considering large arrays with complex nested objects. refactor helpers to allow recursive conversion without copying arrays. --- google/cloud/bigquery/_helpers.py | 35 ++++++++++++++++++++++--------- tests/unit/test__helpers.py | 29 +++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 21a8e3636..1fd43b624 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -15,7 +15,6 @@ """Shared helper functions for BigQuery API classes.""" import base64 -import copy import datetime import decimal import re @@ -396,13 +395,9 @@ def _repeated_field_to_json(field, row_value): Returns: List[Any]: A list of JSON-serializable objects. """ - # Remove the REPEATED, but keep the other fields. This allows us to process - # each item as if it were a top-level field. - item_field = copy.deepcopy(field) - item_field._mode = "NULLABLE" values = [] for item in row_value: - values.append(_field_to_json(item_field, item)) + values.append(_single_field_to_json(field, item)) return values @@ -433,6 +428,29 @@ def _record_field_to_json(fields, row_value): return record +def _single_field_to_json(field, row_value): + """Convert a single (non-repeating) field into JSON-serializable values. + + Args: + field (google.cloud.bigquery.schema.SchemaField): + The SchemaField to use for type conversion and field name. + + row_value (Any): + Scalar or Struct to be inserted. The type + is inferred from the SchemaField's field_type. + + Returns: + Any: A JSON-serializable object. + """ + if row_value is None: + return None + + if field.field_type == "RECORD": + return _record_field_to_json(field.fields, row_value) + + return _scalar_field_to_json(field, row_value) + + def _field_to_json(field, row_value): """Convert a field into JSON-serializable values. @@ -454,10 +472,7 @@ def _field_to_json(field, row_value): if field.mode == "REPEATED": return _repeated_field_to_json(field, row_value) - if field.field_type == "RECORD": - return _record_field_to_json(field.fields, row_value) - - return _scalar_field_to_json(field, row_value) + return _single_field_to_json(field, row_value) def _snake_to_camel_case(value): diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index fa6d27c98..d58b40f76 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -805,6 +805,35 @@ def test_w_known_field_type(self): self.assertEqual(converted, str(original)) +class Test_single_field_to_json(unittest.TestCase): + def _call_fut(self, field, value): + from google.cloud.bigquery._helpers import _single_field_to_json + + return _single_field_to_json(field, value) + + def test_w_none(self): + field = _make_field("INT64") + original = None + converted = self._call_fut(field, original) + self.assertIsNone(converted) + + def test_w_record(self): + subfields = [ + _make_field("INT64", name="one"), + _make_field("STRING", name="two"), + ] + field = _make_field("RECORD", fields=subfields) + original = {"one": 42, "two": "two"} + converted = self._call_fut(field, original) + self.assertEqual(converted, {"one": "42", "two": "two"}) + + def test_w_scalar(self): + field = _make_field("INT64") + original = 42 + converted = self._call_fut(field, original) + self.assertEqual(converted, str(original)) + + class Test_repeated_field_to_json(unittest.TestCase): def _call_fut(self, field, value): from google.cloud.bigquery._helpers import _repeated_field_to_json From c9d862d56e18c42283c899d717e77c19eb5d5c3f Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 6 Oct 2020 16:45:03 -0500 Subject: [PATCH 2/3] add check to ignore REPEATED mode --- google/cloud/bigquery/_helpers.py | 6 +++++- tests/unit/test__helpers.py | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index d5f099992..b5b6f455a 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -458,7 +458,11 @@ def _record_field_to_json(fields, row_value): def _single_field_to_json(field, row_value): - """Convert a single (non-repeating) field into JSON-serializable values. + """Convert a single field into JSON-serializable values. + + Ignores mode so that this can function for ARRAY / REPEATING fiels + without requiring a deepcopy of the field. See: + https://github.com/googleapis/python-bigquery/issues/6 Args: field (google.cloud.bigquery.schema.SchemaField): diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 31bc9066c..16c4fb8a5 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -834,6 +834,12 @@ def test_w_scalar(self): converted = self._call_fut(field, original) self.assertEqual(converted, str(original)) + def test_w_scalar_ignores_mode(self): + field = _make_field("STRING", mode="REPEATED") + original = "hello world" + converted = self._call_fut(field, original) + self.assertEqual(converted, original) + class Test_repeated_field_to_json(unittest.TestCase): def _call_fut(self, field, value): From 54852ae18ce9e33e8d21968b3e4d62987bfcf129 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 6 Oct 2020 16:47:19 -0500 Subject: [PATCH 3/3] Update google/cloud/bigquery/_helpers.py Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com> --- google/cloud/bigquery/_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index b5b6f455a..b59b3d794 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -460,7 +460,7 @@ def _record_field_to_json(fields, row_value): def _single_field_to_json(field, row_value): """Convert a single field into JSON-serializable values. - Ignores mode so that this can function for ARRAY / REPEATING fiels + Ignores mode so that this can function for ARRAY / REPEATING fields without requiring a deepcopy of the field. See: https://github.com/googleapis/python-bigquery/issues/6