From 96aa24545f0ef29639b70cb4f920846a13d70e10 Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 9 Oct 2023 18:30:42 +0000 Subject: [PATCH 01/12] chore: consolidate pyarrow helpers --- google/cloud/bigquery/_helpers.py | 89 +++++++++++++++++++++ google/cloud/bigquery/_pandas_helpers.py | 99 ++++++------------------ tests/unit/test__pandas_helpers.py | 20 ++--- 3 files changed, 121 insertions(+), 87 deletions(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 014a721a8..5c1ca7d7a 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -15,6 +15,7 @@ """Shared helper functions for BigQuery API classes.""" import base64 +import copy import datetime import decimal import math @@ -126,6 +127,28 @@ def verify_version(self): raise LegacyBigQueryStorageError(msg) +def pyarrow_datetime(): + return pyarrow.timestamp("us", tz=None) + + +def pyarrow_numeric(): + return pyarrow.decimal128(38, 9) + + +def pyarrow_bignumeric(): + # 77th digit is partial. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + return pyarrow.decimal256(76, 38) + + +def pyarrow_time(): + return pyarrow.time64("us") + + +def pyarrow_timestamp(): + return pyarrow.timestamp("us", tz="UTC") + + class PyarrowVersions: """Version comparisons for pyarrow package.""" @@ -148,6 +171,26 @@ def installed_version(self) -> packaging.version.Version: return self._installed_version + @property + def bq_to_arrow_scalars(self) -> dict[str]: + """ + Returns: + Dict[str, Any]: + A dictionary of the mapping from BigQuery scalar types to Arrow + scalar types. + """ + return copy.deepcopy(self._BQ_TO_ARROW_SCALARS) + + @property + def arrow_scalar_ids_to_bq(self) -> dict: + """ + Returns: + Dict[Any, str]: + A dictionary of the mapping from Arrow scalar types to BigQuery + scalar types. + """ + return copy.deepcopy(self._ARROW_SCALAR_IDS_TO_BQ) + @property def use_compliant_nested_type(self) -> bool: return self.installed_version.major >= 4 @@ -188,6 +231,52 @@ def try_import(self, raise_if_error: bool = False) -> Any: raise LegacyPyarrowError(msg) return None + # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py + # When modifying it be sure to update it there as well. 
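+        # These tables are built here, after the version checks above, so they
+        # are only populated once a compatible pyarrow import has succeeded;
+        # the bq_to_arrow_scalars and arrow_scalar_ids_to_bq properties then
+        # hand out deep copies so callers cannot mutate the canonical mapping.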
+ # Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py + self._BQ_TO_ARROW_SCALARS = { + "BOOL": pyarrow.bool_, + "BOOLEAN": pyarrow.bool_, + "BYTES": pyarrow.binary, + "DATE": pyarrow.date32, + "DATETIME": pyarrow_datetime, + "FLOAT": pyarrow.float64, + "FLOAT64": pyarrow.float64, + "GEOGRAPHY": pyarrow.string, + "INT64": pyarrow.int64, + "INTEGER": pyarrow.int64, + "NUMERIC": pyarrow_numeric, + "STRING": pyarrow.string, + "TIME": pyarrow_time, + "TIMESTAMP": pyarrow_timestamp, + "BIGNUMERIC": pyarrow_bignumeric, + } + self._ARROW_SCALAR_IDS_TO_BQ = { + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes + pyarrow.bool_().id: "BOOL", + pyarrow.int8().id: "INT64", + pyarrow.int16().id: "INT64", + pyarrow.int32().id: "INT64", + pyarrow.int64().id: "INT64", + pyarrow.uint8().id: "INT64", + pyarrow.uint16().id: "INT64", + pyarrow.uint32().id: "INT64", + pyarrow.uint64().id: "INT64", + pyarrow.float16().id: "FLOAT64", + pyarrow.float32().id: "FLOAT64", + pyarrow.float64().id: "FLOAT64", + pyarrow.time32("ms").id: "TIME", + pyarrow.time64("ns").id: "TIME", + pyarrow.timestamp("ns").id: "TIMESTAMP", + pyarrow.date32().id: "DATE", + pyarrow.date64().id: "DATETIME", # because millisecond resolution + pyarrow.binary().id: "BYTES", + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() + # The exact scale and precision don't matter, see below. + pyarrow.decimal128(38, scale=9).id: "NUMERIC", + pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", + } + return pyarrow diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index a14dbec9b..58974f424 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -119,85 +119,30 @@ def __init__(self): self.done = False -def pyarrow_datetime(): - return pyarrow.timestamp("us", tz=None) - - -def pyarrow_numeric(): - return pyarrow.decimal128(38, 9) - - -def pyarrow_bignumeric(): - # 77th digit is partial. - # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types - return pyarrow.decimal256(76, 38) - - -def pyarrow_time(): - return pyarrow.time64("us") - - -def pyarrow_timestamp(): - return pyarrow.timestamp("us", tz="UTC") - +### remove +# if pyarrow: +# if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): +# BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric +# # The exact decimal's scale and precision are not important, as only +# # the type ID matters, and it's the same for all decimal256 instances. +# ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" +# _BIGNUMERIC_SUPPORT = True +# else: +# _BIGNUMERIC_SUPPORT = False # pragma: NO COVER + +# else: # pragma: NO COVER +# BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER +# ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER +# _BIGNUMERIC_SUPPORT = False # pragma: NO COVER +## remove + +BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER +ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER +_BIGNUMERIC_SUPPORT = False # pragma: NO COVER if pyarrow: - # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py - # When modifying it be sure to update it there as well. 
- BQ_TO_ARROW_SCALARS = { - "BOOL": pyarrow.bool_, - "BOOLEAN": pyarrow.bool_, - "BYTES": pyarrow.binary, - "DATE": pyarrow.date32, - "DATETIME": pyarrow_datetime, - "FLOAT": pyarrow.float64, - "FLOAT64": pyarrow.float64, - "GEOGRAPHY": pyarrow.string, - "INT64": pyarrow.int64, - "INTEGER": pyarrow.int64, - "NUMERIC": pyarrow_numeric, - "STRING": pyarrow.string, - "TIME": pyarrow_time, - "TIMESTAMP": pyarrow_timestamp, - } - ARROW_SCALAR_IDS_TO_BQ = { - # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes - pyarrow.bool_().id: "BOOL", - pyarrow.int8().id: "INT64", - pyarrow.int16().id: "INT64", - pyarrow.int32().id: "INT64", - pyarrow.int64().id: "INT64", - pyarrow.uint8().id: "INT64", - pyarrow.uint16().id: "INT64", - pyarrow.uint32().id: "INT64", - pyarrow.uint64().id: "INT64", - pyarrow.float16().id: "FLOAT64", - pyarrow.float32().id: "FLOAT64", - pyarrow.float64().id: "FLOAT64", - pyarrow.time32("ms").id: "TIME", - pyarrow.time64("ns").id: "TIME", - pyarrow.timestamp("ns").id: "TIMESTAMP", - pyarrow.date32().id: "DATE", - pyarrow.date64().id: "DATETIME", # because millisecond resolution - pyarrow.binary().id: "BYTES", - pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() - # The exact scale and precision don't matter, see below. - pyarrow.decimal128(38, scale=9).id: "NUMERIC", - } - - if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): - BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric - # The exact decimal's scale and precision are not important, as only - # the type ID matters, and it's the same for all decimal256 instances. - ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" - _BIGNUMERIC_SUPPORT = True - else: - _BIGNUMERIC_SUPPORT = False # pragma: NO COVER - -else: # pragma: NO COVER - BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER - ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER - _BIGNUMERIC_SUPPORT = False # pragma: NO COVER + BQ_TO_ARROW_SCALARS = _helpers.PYARROW_VERSIONS.bq_to_arrow_scalars + ARROW_SCALAR_IDS_TO_BQ = _helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index a4cc1fefb..7c78aad38 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -346,14 +346,14 @@ def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): pyarrow.field("field04", pyarrow.int64()), pyarrow.field("field05", pyarrow.float64()), pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", module_under_test.pyarrow_numeric()), - pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field07", _helpers.pyarrow_numeric()), + pyarrow.field("field08", _helpers.pyarrow_bignumeric()), pyarrow.field("field09", pyarrow.bool_()), pyarrow.field("field10", pyarrow.bool_()), - pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field11", _helpers.pyarrow_timestamp()), pyarrow.field("field12", pyarrow.date32()), - pyarrow.field("field13", module_under_test.pyarrow_time()), - pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field13", _helpers.pyarrow_time()), + pyarrow.field("field14", _helpers.pyarrow_datetime()), pyarrow.field("field15", pyarrow.string()), ) expected = pyarrow.struct(expected) @@ -394,14 +394,14 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): pyarrow.field("field04", pyarrow.int64()), pyarrow.field("field05", 
pyarrow.float64()), pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", module_under_test.pyarrow_numeric()), - pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field07", _helpers.pyarrow_numeric()), + pyarrow.field("field08", _helpers.pyarrow_bignumeric()), pyarrow.field("field09", pyarrow.bool_()), pyarrow.field("field10", pyarrow.bool_()), - pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field11", _helpers.pyarrow_timestamp()), pyarrow.field("field12", pyarrow.date32()), - pyarrow.field("field13", module_under_test.pyarrow_time()), - pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field13", _helpers.pyarrow_time()), + pyarrow.field("field14", _helpers.pyarrow_datetime()), pyarrow.field("field15", pyarrow.string()), ) expected_value_type = pyarrow.struct(expected) From cc9918ddb4b2f9e4a0701e07ae23b0f77029e0b6 Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 9 Oct 2023 20:55:58 +0000 Subject: [PATCH 02/12] complete refactor --- google/cloud/bigquery/_helpers.py | 156 ----------------- google/cloud/bigquery/_pandas_helpers.py | 44 ++--- google/cloud/bigquery/_pyarrow_helpers.py | 198 ++++++++++++++++++++++ google/cloud/bigquery/client.py | 4 +- tests/unit/test__helpers.py | 63 ------- tests/unit/test__pandas_helpers.py | 25 +-- tests/unit/test__pyarrow_helpers.py | 79 +++++++++ tests/unit/test_table.py | 3 +- 8 files changed, 305 insertions(+), 267 deletions(-) create mode 100644 google/cloud/bigquery/_pyarrow_helpers.py create mode 100644 tests/unit/test__pyarrow_helpers.py diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 5c1ca7d7a..0f3ee6a27 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -58,8 +58,6 @@ _MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0") -_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0") - _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0") BIGQUERY_EMULATOR_HOST = "BIGQUERY_EMULATOR_HOST" @@ -127,161 +125,7 @@ def verify_version(self): raise LegacyBigQueryStorageError(msg) -def pyarrow_datetime(): - return pyarrow.timestamp("us", tz=None) - - -def pyarrow_numeric(): - return pyarrow.decimal128(38, 9) - - -def pyarrow_bignumeric(): - # 77th digit is partial. - # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types - return pyarrow.decimal256(76, 38) - - -def pyarrow_time(): - return pyarrow.time64("us") - - -def pyarrow_timestamp(): - return pyarrow.timestamp("us", tz="UTC") - - -class PyarrowVersions: - """Version comparisons for pyarrow package.""" - - def __init__(self): - self._installed_version = None - - @property - def installed_version(self) -> packaging.version.Version: - """Return the parsed version of pyarrow.""" - if self._installed_version is None: - import pyarrow # type: ignore - - self._installed_version = packaging.version.parse( - # Use 0.0.0, since it is earlier than any released version. - # Legacy versions also have the same property, but - # creating a LegacyVersion has been deprecated. - # https://github.com/pypa/packaging/issues/321 - getattr(pyarrow, "__version__", "0.0.0") - ) - - return self._installed_version - - @property - def bq_to_arrow_scalars(self) -> dict[str]: - """ - Returns: - Dict[str, Any]: - A dictionary of the mapping from BigQuery scalar types to Arrow - scalar types. 
- """ - return copy.deepcopy(self._BQ_TO_ARROW_SCALARS) - - @property - def arrow_scalar_ids_to_bq(self) -> dict: - """ - Returns: - Dict[Any, str]: - A dictionary of the mapping from Arrow scalar types to BigQuery - scalar types. - """ - return copy.deepcopy(self._ARROW_SCALAR_IDS_TO_BQ) - - @property - def use_compliant_nested_type(self) -> bool: - return self.installed_version.major >= 4 - - def try_import(self, raise_if_error: bool = False) -> Any: - """Verify that a recent enough version of pyarrow extra is - installed. - - The function assumes that pyarrow extra is installed, and should thus - be used in places where this assumption holds. - - Because `pip` can install an outdated version of this extra despite the - constraints in `setup.py`, the calling code can use this helper to - verify the version compatibility at runtime. - - Returns: - The ``pyarrow`` module or ``None``. - - Raises: - LegacyPyarrowError: - If the pyarrow package is outdated and ``raise_if_error`` is ``True``. - """ - try: - import pyarrow - except ImportError as exc: # pragma: NO COVER - if raise_if_error: - raise LegacyPyarrowError( - f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}." - ) from exc - return None - - if self.installed_version < _MIN_PYARROW_VERSION: - if raise_if_error: - msg = ( - "Dependency pyarrow is outdated, please upgrade " - f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})." - ) - raise LegacyPyarrowError(msg) - return None - - # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py - # When modifying it be sure to update it there as well. - # Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py - self._BQ_TO_ARROW_SCALARS = { - "BOOL": pyarrow.bool_, - "BOOLEAN": pyarrow.bool_, - "BYTES": pyarrow.binary, - "DATE": pyarrow.date32, - "DATETIME": pyarrow_datetime, - "FLOAT": pyarrow.float64, - "FLOAT64": pyarrow.float64, - "GEOGRAPHY": pyarrow.string, - "INT64": pyarrow.int64, - "INTEGER": pyarrow.int64, - "NUMERIC": pyarrow_numeric, - "STRING": pyarrow.string, - "TIME": pyarrow_time, - "TIMESTAMP": pyarrow_timestamp, - "BIGNUMERIC": pyarrow_bignumeric, - } - self._ARROW_SCALAR_IDS_TO_BQ = { - # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes - pyarrow.bool_().id: "BOOL", - pyarrow.int8().id: "INT64", - pyarrow.int16().id: "INT64", - pyarrow.int32().id: "INT64", - pyarrow.int64().id: "INT64", - pyarrow.uint8().id: "INT64", - pyarrow.uint16().id: "INT64", - pyarrow.uint32().id: "INT64", - pyarrow.uint64().id: "INT64", - pyarrow.float16().id: "FLOAT64", - pyarrow.float32().id: "FLOAT64", - pyarrow.float64().id: "FLOAT64", - pyarrow.time32("ms").id: "TIME", - pyarrow.time64("ns").id: "TIME", - pyarrow.timestamp("ns").id: "TIMESTAMP", - pyarrow.date32().id: "DATE", - pyarrow.date64().id: "DATETIME", # because millisecond resolution - pyarrow.binary().id: "BYTES", - pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() - # The exact scale and precision don't matter, see below. 
- pyarrow.decimal128(38, scale=9).id: "NUMERIC", - pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", - } - - return pyarrow - - BQ_STORAGE_VERSIONS = BQStorageVersions() -PYARROW_VERSIONS = PyarrowVersions() def _not_null(value, field): diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 58974f424..12f2997fa 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -26,6 +26,7 @@ from packaging import version from google.cloud.bigquery import _helpers +from google.cloud.bigquery import _pyarrow_helpers from google.cloud.bigquery import schema try: @@ -49,7 +50,11 @@ db_dtypes_import_exception = exc date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype -pyarrow = _helpers.PYARROW_VERSIONS.try_import() +pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import() + +_BIGNUMERIC_SUPPORT = False +if pyarrow is not None: + _BIGNUMERIC_SUPPORT = True try: # _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array` @@ -118,33 +123,6 @@ def __init__(self): # the global interpreter lock). self.done = False - -### remove -# if pyarrow: -# if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): -# BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric -# # The exact decimal's scale and precision are not important, as only -# # the type ID matters, and it's the same for all decimal256 instances. -# ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" -# _BIGNUMERIC_SUPPORT = True -# else: -# _BIGNUMERIC_SUPPORT = False # pragma: NO COVER - -# else: # pragma: NO COVER -# BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER -# ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER -# _BIGNUMERIC_SUPPORT = False # pragma: NO COVER -## remove - -BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER -ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER -_BIGNUMERIC_SUPPORT = False # pragma: NO COVER - -if pyarrow: - BQ_TO_ARROW_SCALARS = _helpers.PYARROW_VERSIONS.bq_to_arrow_scalars - ARROW_SCALAR_IDS_TO_BQ = _helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq - - BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { "GEOGRAPHY": { b"ARROW:extension:name": b"google:sqlType:geography", @@ -185,7 +163,7 @@ def bq_to_arrow_data_type(field): if field_type_upper in schema._STRUCT_TYPES: return bq_to_arrow_struct_data_type(field) - data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper) + data_type_constructor = _pyarrow_helpers.PYARROW_VERSIONS.bq_to_arrow_scalars(field_type_upper) if data_type_constructor is None: return None return data_type_constructor() @@ -513,7 +491,7 @@ def augment_schema(dataframe, current_bq_schema): if pyarrow.types.is_list(arrow_table.type): # `pyarrow.ListType` detected_mode = "REPEATED" - detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.values.type.id) + detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq(arrow_table.values.type.id) # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds # it to such datetimes, causing them to be recognized as TIMESTAMP type. 
@@ -529,7 +507,7 @@ def augment_schema(dataframe, current_bq_schema): detected_type = "DATETIME" else: detected_mode = field.mode - detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.type.id) + detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq(arrow_table.type.id) if detected_type is None: unknown_type_fields.append(field) @@ -650,13 +628,13 @@ def dataframe_to_parquet( This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``. """ - pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True) + pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import(raise_if_error=True) import pyarrow.parquet # type: ignore kwargs = ( {"use_compliant_nested_type": parquet_use_compliant_nested_type} - if _helpers.PYARROW_VERSIONS.use_compliant_nested_type + if _pyarrow_helpers.PYARROW_VERSIONS.use_compliant_nested_type else {} ) diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py new file mode 100644 index 000000000..e41d12c16 --- /dev/null +++ b/google/cloud/bigquery/_pyarrow_helpers.py @@ -0,0 +1,198 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared helper functions for connecting BigQuery and pyarrow.""" + +from typing import Any + +from google.cloud.bigquery.exceptions import LegacyPyarrowError + +import packaging.version + +_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0") + +try: + import pyarrow +except ImportError as exc: # pragma: NO COVER + raise LegacyPyarrowError( + f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}." + ) from exc + +# Use 0.0.0, since it is earlier than any released version. +# Legacy versions also have the same property, but +# creating a LegacyVersion has been deprecated. +# https://github.com/pypa/packaging/issues/321 +_pyarrow_version = packaging.version.parse(getattr(pyarrow, "__version__", "0.0.0")) + +if _pyarrow_version < _MIN_PYARROW_VERSION: + msg = ( + "Dependency pyarrow is outdated, please upgrade " + f"it to version >= {_MIN_PYARROW_VERSION} (version found: {_pyarrow_version})." + ) + raise LegacyPyarrowError(msg) + + +def pyarrow_datetime(): + return pyarrow.timestamp("us", tz=None) + + +def pyarrow_numeric(): + return pyarrow.decimal128(38, 9) + + +def pyarrow_bignumeric(): + # 77th digit is partial. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + return pyarrow.decimal256(76, 38) + + +def pyarrow_time(): + return pyarrow.time64("us") + + +def pyarrow_timestamp(): + return pyarrow.timestamp("us", tz="UTC") + + +# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py +# When modifying it be sure to update it there as well. 
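+# The mapped values are zero-argument constructors rather than type instances,
+# so parameterized Arrow types can be built on demand with a uniform calling
+# convention, e.g. _BQ_TO_ARROW_SCALARS["TIMESTAMP"]() returns
+# pyarrow.timestamp("us", tz="UTC").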
+# Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py +_BQ_TO_ARROW_SCALARS = { + "BOOL": pyarrow.bool_, + "BOOLEAN": pyarrow.bool_, + "BYTES": pyarrow.binary, + "DATE": pyarrow.date32, + "DATETIME": pyarrow_datetime, + "FLOAT": pyarrow.float64, + "FLOAT64": pyarrow.float64, + "GEOGRAPHY": pyarrow.string, + "INT64": pyarrow.int64, + "INTEGER": pyarrow.int64, + "NUMERIC": pyarrow_numeric, + "STRING": pyarrow.string, + "TIME": pyarrow_time, + "TIMESTAMP": pyarrow_timestamp, + "BIGNUMERIC": pyarrow_bignumeric, +} + +_ARROW_SCALAR_IDS_TO_BQ = { + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes + pyarrow.bool_().id: "BOOL", + pyarrow.int8().id: "INT64", + pyarrow.int16().id: "INT64", + pyarrow.int32().id: "INT64", + pyarrow.int64().id: "INT64", + pyarrow.uint8().id: "INT64", + pyarrow.uint16().id: "INT64", + pyarrow.uint32().id: "INT64", + pyarrow.uint64().id: "INT64", + pyarrow.float16().id: "FLOAT64", + pyarrow.float32().id: "FLOAT64", + pyarrow.float64().id: "FLOAT64", + pyarrow.time32("ms").id: "TIME", + pyarrow.time64("ns").id: "TIME", + pyarrow.timestamp("ns").id: "TIMESTAMP", + pyarrow.date32().id: "DATE", + pyarrow.date64().id: "DATETIME", # because millisecond resolution + pyarrow.binary().id: "BYTES", + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() + # The exact scale and precision don't matter, see below. + pyarrow.decimal128(38, scale=9).id: "NUMERIC", + pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", +} + + +class PyarrowVersions: + """Version comparisons for pyarrow package.""" + + def __init__(self): + self._installed_version = None + + @property + def installed_version(self) -> packaging.version.Version: + """Return the parsed version of pyarrow.""" + if self._installed_version is None: + import pyarrow # type: ignore + + self._installed_version = packaging.version.parse( + # Use 0.0.0, since it is earlier than any released version. + # Legacy versions also have the same property, but + # creating a LegacyVersion has been deprecated. + # https://github.com/pypa/packaging/issues/321 + getattr(pyarrow, "__version__", "0.0.0") + ) + + return self._installed_version + + @staticmethod + def bq_to_arrow_scalars(bq_scalar: str): + """ + Returns: + The Arrow scalar type that the input BigQuery scalar type maps to. + If cannot find the BigQuery scalar, return None. + """ + return _BQ_TO_ARROW_SCALARS.get(bq_scalar) + + @staticmethod + def arrow_scalar_ids_to_bq(arrow_scalar: any) -> str: + """ + Returns: + The BigQuery scalar type that the input arrow scalar type maps to. + If cannot find the arrow scalar, return None. + """ + return _ARROW_SCALAR_IDS_TO_BQ.get(arrow_scalar) + + @property + def use_compliant_nested_type(self) -> bool: + return self.installed_version.major >= 4 + + def try_import(self, raise_if_error: bool = False) -> Any: + """Verify that a recent enough version of pyarrow extra is + installed. + + The function assumes that pyarrow extra is installed, and should thus + be used in places where this assumption holds. + + Because `pip` can install an outdated version of this extra despite the + constraints in `setup.py`, the calling code can use this helper to + verify the version compatibility at runtime. + + Returns: + The ``pyarrow`` module or ``None``. + + Raises: + LegacyPyarrowError: + If the pyarrow package is outdated and ``raise_if_error`` is ``True``. 
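+
+        Example (a sketch using the module-level ``PYARROW_VERSIONS``
+        instance defined at the bottom of this file):
+
+            pyarrow = PYARROW_VERSIONS.try_import(raise_if_error=True)
+            arrow_type = PyarrowVersions.bq_to_arrow_scalars("NUMERIC")()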
+ """ + try: + import pyarrow + except ImportError as exc: # pragma: NO COVER + if raise_if_error: + raise LegacyPyarrowError( + f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}." + ) from exc + return None + + if self.installed_version < _MIN_PYARROW_VERSION: + if raise_if_error: + msg = ( + "Dependency pyarrow is outdated, please upgrade " + f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})." + ) + raise LegacyPyarrowError(msg) + return None + + return pyarrow + +PYARROW_VERSIONS = PyarrowVersions() diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index b4783fc56..2b4279f79 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -121,7 +121,7 @@ from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.table import RowIterator from google.cloud.bigquery.format_options import ParquetOptions -from google.cloud.bigquery import _helpers +from google.cloud.bigquery import _pyarrow_helpers TimeoutType = Union[float, None] ResumableTimeoutType = Union[ @@ -2716,7 +2716,7 @@ def load_table_from_dataframe( compression=parquet_compression, **( {"use_compliant_nested_type": True} - if _helpers.PYARROW_VERSIONS.use_compliant_nested_type + if _pyarrow_helpers.PYARROW_VERSIONS.use_compliant_nested_type else {} ), ) diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 4fb86f665..cff48a1f2 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -24,11 +24,6 @@ except ImportError: # pragma: NO COVER bigquery_storage = None -try: - import pyarrow -except ImportError: # pragma: NO COVER - pyarrow = None - @unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`") class TestBQStorageVersions(unittest.TestCase): @@ -99,64 +94,6 @@ def test_is_read_session_optional_false(self): with mock.patch("google.cloud.bigquery_storage.__version__", new="2.5.0"): assert not versions.is_read_session_optional - -@unittest.skipIf(pyarrow is None, "Requires `pyarrow`") -class TestPyarrowVersions(unittest.TestCase): - def tearDown(self): - from google.cloud.bigquery import _helpers - - # Reset any cached versions since it may not match reality. 
- _helpers.PYARROW_VERSIONS._installed_version = None - - def _object_under_test(self): - from google.cloud.bigquery import _helpers - - return _helpers.PyarrowVersions() - - def _call_try_import(self, **kwargs): - from google.cloud.bigquery import _helpers - - _helpers.PYARROW_VERSIONS._installed_version = None - return _helpers.PYARROW_VERSIONS.try_import(**kwargs) - - def test_try_import_raises_no_error_w_recent_pyarrow(self): - from google.cloud.bigquery.exceptions import LegacyPyarrowError - - with mock.patch("pyarrow.__version__", new="5.0.0"): - try: - pyarrow = self._call_try_import(raise_if_error=True) - self.assertIsNotNone(pyarrow) - except LegacyPyarrowError: # pragma: NO COVER - self.fail("Legacy error raised with a non-legacy dependency version.") - - def test_try_import_returns_none_w_legacy_pyarrow(self): - with mock.patch("pyarrow.__version__", new="2.0.0"): - pyarrow = self._call_try_import() - self.assertIsNone(pyarrow) - - def test_try_import_raises_error_w_legacy_pyarrow(self): - from google.cloud.bigquery.exceptions import LegacyPyarrowError - - with mock.patch("pyarrow.__version__", new="2.0.0"): - with self.assertRaises(LegacyPyarrowError): - self._call_try_import(raise_if_error=True) - - def test_installed_version_returns_cached(self): - versions = self._object_under_test() - versions._installed_version = object() - assert versions.installed_version is versions._installed_version - - def test_installed_version_returns_parsed_version(self): - versions = self._object_under_test() - - with mock.patch("pyarrow.__version__", new="1.2.3"): - version = versions.installed_version - - assert version.major == 1 - assert version.minor == 2 - assert version.micro == 3 - - class Test_not_null(unittest.TestCase): def _call_fut(self, value, field): from google.cloud.bigquery._helpers import _not_null diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 7c78aad38..8bfb7028c 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -41,10 +41,11 @@ from google.cloud.bigquery import exceptions from google.cloud.bigquery import _helpers +from google.cloud.bigquery import _pyarrow_helpers from google.cloud.bigquery import schema from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT -pyarrow = _helpers.PYARROW_VERSIONS.try_import() +pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import() if pyarrow: import pyarrow.parquet @@ -346,14 +347,14 @@ def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): pyarrow.field("field04", pyarrow.int64()), pyarrow.field("field05", pyarrow.float64()), pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", _helpers.pyarrow_numeric()), - pyarrow.field("field08", _helpers.pyarrow_bignumeric()), + pyarrow.field("field07", _pyarrow_helpers.pyarrow_numeric()), + pyarrow.field("field08", _pyarrow_helpers.pyarrow_bignumeric()), pyarrow.field("field09", pyarrow.bool_()), pyarrow.field("field10", pyarrow.bool_()), - pyarrow.field("field11", _helpers.pyarrow_timestamp()), + pyarrow.field("field11", _pyarrow_helpers.pyarrow_timestamp()), pyarrow.field("field12", pyarrow.date32()), - pyarrow.field("field13", _helpers.pyarrow_time()), - pyarrow.field("field14", _helpers.pyarrow_datetime()), + pyarrow.field("field13", _pyarrow_helpers.pyarrow_time()), + pyarrow.field("field14", _pyarrow_helpers.pyarrow_datetime()), pyarrow.field("field15", pyarrow.string()), ) expected = pyarrow.struct(expected) @@ -394,14 +395,14 @@ def 
test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): pyarrow.field("field04", pyarrow.int64()), pyarrow.field("field05", pyarrow.float64()), pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", _helpers.pyarrow_numeric()), - pyarrow.field("field08", _helpers.pyarrow_bignumeric()), + pyarrow.field("field07", _pyarrow_helpers.pyarrow_numeric()), + pyarrow.field("field08", _pyarrow_helpers.pyarrow_bignumeric()), pyarrow.field("field09", pyarrow.bool_()), pyarrow.field("field10", pyarrow.bool_()), - pyarrow.field("field11", _helpers.pyarrow_timestamp()), + pyarrow.field("field11", _pyarrow_helpers.pyarrow_timestamp()), pyarrow.field("field12", pyarrow.date32()), - pyarrow.field("field13", _helpers.pyarrow_time()), - pyarrow.field("field14", _helpers.pyarrow_datetime()), + pyarrow.field("field13", _pyarrow_helpers.pyarrow_time()), + pyarrow.field("field14", _pyarrow_helpers.pyarrow_datetime()), pyarrow.field("field15", pyarrow.string()), ) expected_value_type = pyarrow.struct(expected) @@ -1117,7 +1118,7 @@ def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch): mock_pyarrow_import.side_effect = exceptions.LegacyPyarrowError( "pyarrow not installed" ) - monkeypatch.setattr(_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import) + monkeypatch.setattr(_pyarrow_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import) with pytest.raises(exceptions.LegacyPyarrowError): module_under_test.dataframe_to_parquet(pandas.DataFrame(), (), None) diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py new file mode 100644 index 000000000..147e06cd9 --- /dev/null +++ b/tests/unit/test__pyarrow_helpers.py @@ -0,0 +1,79 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import mock + +try: + import pyarrow +except ImportError: # pragma: NO COVER + pyarrow = None + + +@unittest.skipIf(pyarrow is None, "Requires `pyarrow`") +class TestPyarrowVersions(unittest.TestCase): + def tearDown(self): + from google.cloud.bigquery import _pyarrow_helpers + + # Reset any cached versions since it may not match reality. 
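+        # installed_version memoizes the parsed pyarrow version in
+        # _installed_version on first access, so tests that patch
+        # pyarrow.__version__ must clear the cached value between cases.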
+ _pyarrow_helpers.PYARROW_VERSIONS._installed_version = None + + def _object_under_test(self): + from google.cloud.bigquery import _pyarrow_helpers + + return _pyarrow_helpers.PyarrowVersions() + + def _call_try_import(self, **kwargs): + from google.cloud.bigquery import _pyarrow_helpers + + _pyarrow_helpers.PYARROW_VERSIONS._installed_version = None + return _pyarrow_helpers.PYARROW_VERSIONS.try_import(**kwargs) + + def test_try_import_raises_no_error_w_recent_pyarrow(self): + from google.cloud.bigquery.exceptions import LegacyPyarrowError + + with mock.patch("pyarrow.__version__", new="5.0.0"): + try: + pyarrow = self._call_try_import(raise_if_error=True) + self.assertIsNotNone(pyarrow) + except LegacyPyarrowError: # pragma: NO COVER + self.fail("Legacy error raised with a non-legacy dependency version.") + + def test_try_import_returns_none_w_legacy_pyarrow(self): + with mock.patch("pyarrow.__version__", new="2.0.0"): + pyarrow = self._call_try_import() + self.assertIsNone(pyarrow) + + def test_try_import_raises_error_w_legacy_pyarrow(self): + from google.cloud.bigquery.exceptions import LegacyPyarrowError + + with mock.patch("pyarrow.__version__", new="2.0.0"): + with self.assertRaises(LegacyPyarrowError): + self._call_try_import(raise_if_error=True) + + def test_installed_version_returns_cached(self): + versions = self._object_under_test() + versions._installed_version = object() + assert versions.installed_version is versions._installed_version + + def test_installed_version_returns_parsed_version(self): + versions = self._object_under_test() + + with mock.patch("pyarrow.__version__", new="1.2.3"): + version = versions.installed_version + + assert version.major == 1 + assert version.minor == 2 + assert version.micro == 3 diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index f31dc5528..4b09b361c 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -41,8 +41,9 @@ big_query_read_grpc_transport = None from google.cloud.bigquery import _helpers +from google.cloud.bigquery import _pyarrow_helpers -pyarrow = _helpers.PYARROW_VERSIONS.try_import() +pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import() PYARROW_VERSION = pkg_resources.parse_version("0.0.1") if pyarrow: From 4e54c0f018f9c54f8dc81042dc7d06907c2aaac3 Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 9 Oct 2023 21:45:29 +0000 Subject: [PATCH 03/12] consolidate pyarrow version checking usage --- google/cloud/bigquery/client.py | 22 ++----------------- google/cloud/bigquery/table.py | 3 ++- tests/unit/test_client.py | 38 --------------------------------- tests/unit/test_table.py | 6 ------ 4 files changed, 4 insertions(+), 65 deletions(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 2b4279f79..f76b5ac12 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -27,7 +27,6 @@ import json import math import os -import packaging.version import tempfile import typing from typing import ( @@ -45,12 +44,9 @@ import uuid import warnings -try: - import pyarrow # type: ignore +from google.cloud.bigquery import _pyarrow_helpers - _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__) -except ImportError: # pragma: NO COVER - pyarrow = None +pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import() from google import resumable_media # type: ignore from google.resumable_media.requests import MultipartUpload # type: ignore @@ -159,9 +155,6 @@ TIMEOUT_HEADER = "X-Server-Timeout" -# 
https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414 -_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")]) - class Project(object): """Wrapper for resource describing a BigQuery project. @@ -2688,16 +2681,6 @@ def load_table_from_dataframe( try: if new_job_config.source_format == job.SourceFormat.PARQUET: - if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS: - msg = ( - "Loading dataframe data in PARQUET format with pyarrow " - f"{_PYARROW_VERSION} can result in data corruption. It is " - "therefore *strongly* advised to use a different pyarrow " - "version or a different source format. " - "See: https://github.com/googleapis/python-bigquery/issues/781" - ) - warnings.warn(msg, category=RuntimeWarning) - if new_job_config.schema: if parquet_compression == "snappy": # adjust the default value parquet_compression = parquet_compression.upper() @@ -2722,7 +2705,6 @@ def load_table_from_dataframe( ) else: - dataframe.to_csv( tmppath, index=False, diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 462447d51..55e94a319 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -60,6 +60,7 @@ import google.cloud._helpers # type: ignore from google.cloud.bigquery import _helpers from google.cloud.bigquery import _pandas_helpers +from google.cloud.bigquery import _pyarrow_helpers from google.cloud.bigquery.enums import DefaultPandasDTypes from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.schema import _build_schema_resource @@ -73,7 +74,7 @@ # Unconditionally import optional dependencies again to tell pytype that # they are not None, avoiding false "no attribute" errors. import pandas - import pyarrow + pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import() import geopandas # type: ignore from google.cloud import bigquery_storage # type: ignore from google.cloud.bigquery.dataset import DatasetReference diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index faa073dce..9cff2ab78 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -8608,44 +8608,6 @@ def test_load_table_from_dataframe_wo_pyarrow_raises_error(self): parquet_compression="gzip", ) - def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self): - pytest.importorskip("pandas", reason="Requires `pandas`") - pytest.importorskip("pyarrow", reason="Requires `pyarrow`") - - client = self._make_client() - records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] - dataframe = pandas.DataFrame(records) - - pyarrow_version_patch = mock.patch( - "google.cloud.bigquery.client._PYARROW_VERSION", - packaging.version.parse("2.0.0"), # A known bad version of pyarrow. 
- ) - get_table_patch = mock.patch( - "google.cloud.bigquery.client.Client.get_table", - autospec=True, - side_effect=google.api_core.exceptions.NotFound("Table not found"), - ) - load_patch = mock.patch( - "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True - ) - - with load_patch, get_table_patch, pyarrow_version_patch: - with warnings.catch_warnings(record=True) as warned: - client.load_table_from_dataframe( - dataframe, - self.TABLE_REF, - location=self.LOCATION, - ) - - expected_warnings = [ - warning for warning in warned if "pyarrow" in str(warning).lower() - ] - assert len(expected_warnings) == 1 - assert issubclass(expected_warnings[0].category, RuntimeWarning) - msg = str(expected_warnings[0].message) - assert "pyarrow 2.0.0" in msg - assert "data corruption" in msg - @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_nulls(self): diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 4b09b361c..488ce31e8 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -44,14 +44,10 @@ from google.cloud.bigquery import _pyarrow_helpers pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import() -PYARROW_VERSION = pkg_resources.parse_version("0.0.1") if pyarrow: - import pyarrow import pyarrow.types - PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) - try: import pandas except (ImportError, AttributeError): # pragma: NO COVER @@ -74,8 +70,6 @@ except (ImportError, AttributeError): # pragma: NO COVER tqdm = None -PYARROW_TIMESTAMP_VERSION = pkg_resources.parse_version("2.0.0") - if pandas is not None: PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version else: From ae5a4347f8978751265d150c077083a25a0e4559 Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 9 Oct 2023 21:59:53 +0000 Subject: [PATCH 04/12] add unit tests --- tests/unit/test__pyarrow_helpers.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py index 147e06cd9..d2837ab7b 100644 --- a/tests/unit/test__pyarrow_helpers.py +++ b/tests/unit/test__pyarrow_helpers.py @@ -77,3 +77,17 @@ def test_installed_version_returns_parsed_version(self): assert version.major == 1 assert version.minor == 2 assert version.micro == 3 + + def test_bq_to_arrow_scalars(self): + from google.cloud.bigquery import _pyarrow_helpers + + versions = self._object_under_test() + + assert versions.bq_to_arrow_scalars("BIGNUMERIC") == _pyarrow_helpers.pyarrow_bignumeric + assert versions.bq_to_arrow_scalars("UNKNOWN_TYPE") is None + + def test_arrow_scalar_ids_to_bq(self): + versions = self._object_under_test() + + assert versions.arrow_scalar_ids_to_bq(pyarrow.bool_().id) == "BOOL" + assert versions.arrow_scalar_ids_to_bq("UNKNOWN_TYPE") is None From 76b5cd096956d3798c33efb6af3e40f582d75bca Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 9 Oct 2023 23:35:38 +0000 Subject: [PATCH 05/12] fix unit_noextras testing error --- google/cloud/bigquery/_pyarrow_helpers.py | 118 ++++++++++------------ google/cloud/bigquery/client.py | 7 +- 2 files changed, 56 insertions(+), 69 deletions(-) diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py index e41d12c16..e030a73f8 100644 --- a/google/cloud/bigquery/_pyarrow_helpers.py +++ b/google/cloud/bigquery/_pyarrow_helpers.py @@ -23,25 +23,9 @@ _MIN_PYARROW_VERSION = packaging.version.Version("3.0.0") try: 
- import pyarrow -except ImportError as exc: # pragma: NO COVER - raise LegacyPyarrowError( - f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}." - ) from exc - -# Use 0.0.0, since it is earlier than any released version. -# Legacy versions also have the same property, but -# creating a LegacyVersion has been deprecated. -# https://github.com/pypa/packaging/issues/321 -_pyarrow_version = packaging.version.parse(getattr(pyarrow, "__version__", "0.0.0")) - -if _pyarrow_version < _MIN_PYARROW_VERSION: - msg = ( - "Dependency pyarrow is outdated, please upgrade " - f"it to version >= {_MIN_PYARROW_VERSION} (version found: {_pyarrow_version})." - ) - raise LegacyPyarrowError(msg) - + import pyarrow # type: ignore +except: + pyarrow = None def pyarrow_datetime(): return pyarrow.timestamp("us", tz=None) @@ -65,52 +49,56 @@ def pyarrow_timestamp(): return pyarrow.timestamp("us", tz="UTC") -# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py -# When modifying it be sure to update it there as well. -# Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py -_BQ_TO_ARROW_SCALARS = { - "BOOL": pyarrow.bool_, - "BOOLEAN": pyarrow.bool_, - "BYTES": pyarrow.binary, - "DATE": pyarrow.date32, - "DATETIME": pyarrow_datetime, - "FLOAT": pyarrow.float64, - "FLOAT64": pyarrow.float64, - "GEOGRAPHY": pyarrow.string, - "INT64": pyarrow.int64, - "INTEGER": pyarrow.int64, - "NUMERIC": pyarrow_numeric, - "STRING": pyarrow.string, - "TIME": pyarrow_time, - "TIMESTAMP": pyarrow_timestamp, - "BIGNUMERIC": pyarrow_bignumeric, -} - -_ARROW_SCALAR_IDS_TO_BQ = { - # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes - pyarrow.bool_().id: "BOOL", - pyarrow.int8().id: "INT64", - pyarrow.int16().id: "INT64", - pyarrow.int32().id: "INT64", - pyarrow.int64().id: "INT64", - pyarrow.uint8().id: "INT64", - pyarrow.uint16().id: "INT64", - pyarrow.uint32().id: "INT64", - pyarrow.uint64().id: "INT64", - pyarrow.float16().id: "FLOAT64", - pyarrow.float32().id: "FLOAT64", - pyarrow.float64().id: "FLOAT64", - pyarrow.time32("ms").id: "TIME", - pyarrow.time64("ns").id: "TIME", - pyarrow.timestamp("ns").id: "TIMESTAMP", - pyarrow.date32().id: "DATE", - pyarrow.date64().id: "DATETIME", # because millisecond resolution - pyarrow.binary().id: "BYTES", - pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() - # The exact scale and precision don't matter, see below. - pyarrow.decimal128(38, scale=9).id: "NUMERIC", - pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", -} +_BQ_TO_ARROW_SCALARS = {} +_ARROW_SCALAR_IDS_TO_BQ = {} + +if pyarrow: + # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py + # When modifying it be sure to update it there as well. 
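+    # Building the mappings only when the pyarrow import succeeded keeps this
+    # module importable without the pyarrow extra (the unit_noextras session),
+    # in which case callers see the empty dictionaries defined above.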
+ # Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py + _BQ_TO_ARROW_SCALARS = { + "BOOL": pyarrow.bool_, + "BOOLEAN": pyarrow.bool_, + "BYTES": pyarrow.binary, + "DATE": pyarrow.date32, + "DATETIME": pyarrow_datetime, + "FLOAT": pyarrow.float64, + "FLOAT64": pyarrow.float64, + "GEOGRAPHY": pyarrow.string, + "INT64": pyarrow.int64, + "INTEGER": pyarrow.int64, + "NUMERIC": pyarrow_numeric, + "STRING": pyarrow.string, + "TIME": pyarrow_time, + "TIMESTAMP": pyarrow_timestamp, + "BIGNUMERIC": pyarrow_bignumeric, + } + + _ARROW_SCALAR_IDS_TO_BQ = { + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes + pyarrow.bool_().id: "BOOL", + pyarrow.int8().id: "INT64", + pyarrow.int16().id: "INT64", + pyarrow.int32().id: "INT64", + pyarrow.int64().id: "INT64", + pyarrow.uint8().id: "INT64", + pyarrow.uint16().id: "INT64", + pyarrow.uint32().id: "INT64", + pyarrow.uint64().id: "INT64", + pyarrow.float16().id: "FLOAT64", + pyarrow.float32().id: "FLOAT64", + pyarrow.float64().id: "FLOAT64", + pyarrow.time32("ms").id: "TIME", + pyarrow.time64("ns").id: "TIME", + pyarrow.timestamp("ns").id: "TIMESTAMP", + pyarrow.date32().id: "DATE", + pyarrow.date64().id: "DATETIME", # because millisecond resolution + pyarrow.binary().id: "BYTES", + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() + # The exact scale and precision don't matter, see below. + pyarrow.decimal128(38, scale=9).id: "NUMERIC", + pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", + } class PyarrowVersions: diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index f76b5ac12..e571a1e0c 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -44,10 +44,6 @@ import uuid import warnings -from google.cloud.bigquery import _pyarrow_helpers - -pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import() - from google import resumable_media # type: ignore from google.resumable_media.requests import MultipartUpload # type: ignore from google.resumable_media.requests import ResumableUpload @@ -117,8 +113,11 @@ from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.table import RowIterator from google.cloud.bigquery.format_options import ParquetOptions + from google.cloud.bigquery import _pyarrow_helpers +pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import() + TimeoutType = Union[float, None] ResumableTimeoutType = Union[ None, float, Tuple[float, float] From d0f8e42b73faf0b5b9964cc5755563c1380f3761 Mon Sep 17 00:00:00 2001 From: Linchin Date: Tue, 10 Oct 2023 23:50:32 +0000 Subject: [PATCH 06/12] fix tests --- google/cloud/bigquery/_helpers.py | 8 ++------ google/cloud/bigquery/_pandas_helpers.py | 15 ++++++++++----- google/cloud/bigquery/_pyarrow_helpers.py | 6 ++++-- google/cloud/bigquery/table.py | 1 + noxfile.py | 7 ------- tests/system/test_client.py | 1 - tests/unit/test__helpers.py | 1 + tests/unit/test__pandas_helpers.py | 4 +++- tests/unit/test__pyarrow_helpers.py | 7 +++++-- tests/unit/test_client.py | 1 - 10 files changed, 26 insertions(+), 25 deletions(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 0f3ee6a27..4d4cdce07 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -15,13 +15,12 @@ """Shared helper functions for BigQuery API classes.""" import base64 -import copy import datetime import decimal import math import re import os -from typing import Any, Optional, Union +from typing import Optional, Union from 
dateutil import relativedelta from google.cloud._helpers import UTC # type: ignore @@ -33,10 +32,7 @@ import packaging.version -from google.cloud.bigquery.exceptions import ( - LegacyBigQueryStorageError, - LegacyPyarrowError, -) +from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError _RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f" _TIMEONLY_WO_MICROS = "%H:%M:%S" diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 12f2997fa..28940c09f 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -23,8 +23,6 @@ import warnings from typing import Any, Union -from packaging import version - from google.cloud.bigquery import _helpers from google.cloud.bigquery import _pyarrow_helpers from google.cloud.bigquery import schema @@ -123,6 +121,7 @@ def __init__(self): # the global interpreter lock). self.done = False + BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { "GEOGRAPHY": { b"ARROW:extension:name": b"google:sqlType:geography", @@ -163,7 +162,9 @@ def bq_to_arrow_data_type(field): if field_type_upper in schema._STRUCT_TYPES: return bq_to_arrow_struct_data_type(field) - data_type_constructor = _pyarrow_helpers.PYARROW_VERSIONS.bq_to_arrow_scalars(field_type_upper) + data_type_constructor = _pyarrow_helpers.PYARROW_VERSIONS.bq_to_arrow_scalars( + field_type_upper + ) if data_type_constructor is None: return None return data_type_constructor() @@ -491,7 +492,9 @@ def augment_schema(dataframe, current_bq_schema): if pyarrow.types.is_list(arrow_table.type): # `pyarrow.ListType` detected_mode = "REPEATED" - detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq(arrow_table.values.type.id) + detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq( + arrow_table.values.type.id + ) # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds # it to such datetimes, causing them to be recognized as TIMESTAMP type. @@ -507,7 +510,9 @@ def augment_schema(dataframe, current_bq_schema): detected_type = "DATETIME" else: detected_mode = field.mode - detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq(arrow_table.type.id) + detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq( + arrow_table.type.id + ) if detected_type is None: unknown_type_fields.append(field) diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py index e030a73f8..421072289 100644 --- a/google/cloud/bigquery/_pyarrow_helpers.py +++ b/google/cloud/bigquery/_pyarrow_helpers.py @@ -24,9 +24,10 @@ try: import pyarrow # type: ignore -except: +except ImportError: # pragma: NO COVER pyarrow = None + def pyarrow_datetime(): return pyarrow.timestamp("us", tz=None) @@ -133,7 +134,7 @@ def bq_to_arrow_scalars(bq_scalar: str): return _BQ_TO_ARROW_SCALARS.get(bq_scalar) @staticmethod - def arrow_scalar_ids_to_bq(arrow_scalar: any) -> str: + def arrow_scalar_ids_to_bq(arrow_scalar: Any): """ Returns: The BigQuery scalar type that the input arrow scalar type maps to. 
@@ -183,4 +184,5 @@ def try_import(self, raise_if_error: bool = False) -> Any: return pyarrow + PYARROW_VERSIONS = PyarrowVersions() diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 55e94a319..5b29d30d6 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -74,6 +74,7 @@ # Unconditionally import optional dependencies again to tell pytype that # they are not None, avoiding false "no attribute" errors. import pandas + pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import() import geopandas # type: ignore from google.cloud import bigquery_storage # type: ignore diff --git a/noxfile.py b/noxfile.py index 93616485f..579959d42 100644 --- a/noxfile.py +++ b/noxfile.py @@ -112,13 +112,6 @@ def unit(session): @nox.session(python=[UNIT_TEST_PYTHON_VERSIONS[0], UNIT_TEST_PYTHON_VERSIONS[-1]]) def unit_noextras(session): """Run the unit test suite.""" - - # Install optional dependencies that are out-of-date. - # https://github.com/googleapis/python-bigquery/issues/933 - # There is no pyarrow 1.0.0 package for Python 3.9. - if session.python == UNIT_TEST_PYTHON_VERSIONS[0]: - session.install("pyarrow==1.0.0") - default(session, install_extras=False) diff --git a/tests/system/test_client.py b/tests/system/test_client.py index 8fd532f4c..d3b95ec49 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -2319,7 +2319,6 @@ def _table_exists(t): def test_dbapi_create_view(dataset_id: str): - query = f""" CREATE VIEW {dataset_id}.dbapi_create_view AS SELECT name, SUM(number) AS total diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index cff48a1f2..bebb6467c 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -94,6 +94,7 @@ def test_is_read_session_optional_false(self): with mock.patch("google.cloud.bigquery_storage.__version__", new="2.5.0"): assert not versions.is_read_session_optional + class Test_not_null(unittest.TestCase): def _call_fut(self, value, field): from google.cloud.bigquery._helpers import _not_null diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 8bfb7028c..d7f18b48b 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -1118,7 +1118,9 @@ def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch): mock_pyarrow_import.side_effect = exceptions.LegacyPyarrowError( "pyarrow not installed" ) - monkeypatch.setattr(_pyarrow_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import) + monkeypatch.setattr( + _pyarrow_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import + ) with pytest.raises(exceptions.LegacyPyarrowError): module_under_test.dataframe_to_parquet(pandas.DataFrame(), (), None) diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py index d2837ab7b..c35916ec3 100644 --- a/tests/unit/test__pyarrow_helpers.py +++ b/tests/unit/test__pyarrow_helpers.py @@ -82,8 +82,11 @@ def test_bq_to_arrow_scalars(self): from google.cloud.bigquery import _pyarrow_helpers versions = self._object_under_test() - - assert versions.bq_to_arrow_scalars("BIGNUMERIC") == _pyarrow_helpers.pyarrow_bignumeric + + assert ( + versions.bq_to_arrow_scalars("BIGNUMERIC") + == _pyarrow_helpers.pyarrow_bignumeric + ) assert versions.bq_to_arrow_scalars("UNKNOWN_TYPE") is None def test_arrow_scalar_ids_to_bq(self): diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 9cff2ab78..d48697449 100644 --- a/tests/unit/test_client.py +++ 

From 465e3786498e3eb584dba90b303fb197c363385d Mon Sep 17 00:00:00 2001
From: Linchin
Date: Sat, 14 Oct 2023 02:17:58 +0000
Subject: [PATCH 07/12] address comments

---
 google/cloud/bigquery/_pandas_helpers.py   |  13 +--
 google/cloud/bigquery/_pyarrow_helpers.py  | 106 +++------------------
 google/cloud/bigquery/_versions_helpers.py |  96 +++++++++++++++++++
 google/cloud/bigquery/client.py            |  10 +-
 google/cloud/bigquery/table.py             |   3 +-
 noxfile.py                                 |   7 ++
 tests/unit/test__pandas_helpers.py         |   5 +-
 tests/unit/test__pyarrow_helpers.py        |  88 ++++------------
 tests/unit/test__versions_helpers.py       |  72 ++++++++++++++
 tests/unit/test_client.py                  |  32 +++++++
 tests/unit/test_table.py                   |   4 +-
 11 files changed, 260 insertions(+), 176 deletions(-)
 create mode 100644 google/cloud/bigquery/_versions_helpers.py
 create mode 100644 tests/unit/test__versions_helpers.py

diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
index 28940c09f..66db97090 100644
--- a/google/cloud/bigquery/_pandas_helpers.py
+++ b/google/cloud/bigquery/_pandas_helpers.py
@@ -25,6 +25,7 @@
 
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery import _pyarrow_helpers
+from google.cloud.bigquery import _versions_helpers
 from google.cloud.bigquery import schema
 
 try:
@@ -48,7 +49,7 @@
     db_dtypes_import_exception = exc
     date_dtype_name = time_dtype_name = ""  # Use '' rather than None because pytype
 
-pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import()
+pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()
 
 _BIGNUMERIC_SUPPORT = False
 if pyarrow is not None:
@@ -162,7 +163,7 @@ def bq_to_arrow_data_type(field):
     if field_type_upper in schema._STRUCT_TYPES:
         return bq_to_arrow_struct_data_type(field)
 
-    data_type_constructor = _pyarrow_helpers.PYARROW_VERSIONS.bq_to_arrow_scalars(
+    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(
         field_type_upper
     )
     if data_type_constructor is None:
@@ -492,7 +493,7 @@ def augment_schema(dataframe, current_bq_schema):
         if pyarrow.types.is_list(arrow_table.type):
             # `pyarrow.ListType`
             detected_mode = "REPEATED"
-            detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq(
+            detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(
                 arrow_table.values.type.id
             )
 
@@ -510,7 +511,7 @@
                 detected_type = "DATETIME"
             else:
                 detected_mode = field.mode
-                detected_type = _pyarrow_helpers.PYARROW_VERSIONS.arrow_scalar_ids_to_bq(
+                detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(
                     arrow_table.type.id
                 )
 
@@ -633,13 +634,13 @@ def dataframe_to_parquet(
         This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``.
     """
-    pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
+    pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
 
     import pyarrow.parquet  # type: ignore
 
     kwargs = (
         {"use_compliant_nested_type": parquet_use_compliant_nested_type}
-        if _pyarrow_helpers.PYARROW_VERSIONS.use_compliant_nested_type
+        if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type
         else {}
     )
diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py
index 421072289..17d40485c 100644
--- a/google/cloud/bigquery/_pyarrow_helpers.py
+++ b/google/cloud/bigquery/_pyarrow_helpers.py
@@ -16,12 +16,6 @@
 
 from typing import Any
 
-from google.cloud.bigquery.exceptions import LegacyPyarrowError
-
-import packaging.version
-
-_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")
-
 try:
     import pyarrow  # type: ignore
 except ImportError:  # pragma: NO COVER
@@ -101,88 +95,18 @@ def pyarrow_timestamp():
         pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
     }
 
-
-class PyarrowVersions:
-    """Version comparisons for pyarrow package."""
-
-    def __init__(self):
-        self._installed_version = None
-
-    @property
-    def installed_version(self) -> packaging.version.Version:
-        """Return the parsed version of pyarrow."""
-        if self._installed_version is None:
-            import pyarrow  # type: ignore
-
-            self._installed_version = packaging.version.parse(
-                # Use 0.0.0, since it is earlier than any released version.
-                # Legacy versions also have the same property, but
-                # creating a LegacyVersion has been deprecated.
-                # https://github.com/pypa/packaging/issues/321
-                getattr(pyarrow, "__version__", "0.0.0")
-            )
-
-        return self._installed_version
-
-    @staticmethod
-    def bq_to_arrow_scalars(bq_scalar: str):
-        """
-        Returns:
-            The Arrow scalar type that the input BigQuery scalar type maps to.
-            If cannot find the BigQuery scalar, return None.
-        """
-        return _BQ_TO_ARROW_SCALARS.get(bq_scalar)
-
-    @staticmethod
-    def arrow_scalar_ids_to_bq(arrow_scalar: Any):
-        """
-        Returns:
-            The BigQuery scalar type that the input arrow scalar type maps to.
-            If cannot find the arrow scalar, return None.
-        """
-        return _ARROW_SCALAR_IDS_TO_BQ.get(arrow_scalar)
-
-    @property
-    def use_compliant_nested_type(self) -> bool:
-        return self.installed_version.major >= 4
-
-    def try_import(self, raise_if_error: bool = False) -> Any:
-        """Verify that a recent enough version of pyarrow extra is
-        installed.
-
-        The function assumes that pyarrow extra is installed, and should thus
-        be used in places where this assumption holds.
-
-        Because `pip` can install an outdated version of this extra despite the
-        constraints in `setup.py`, the calling code can use this helper to
-        verify the version compatibility at runtime.
-
-        Returns:
-            The ``pyarrow`` module or ``None``.
-
-        Raises:
-            LegacyPyarrowError:
-                If the pyarrow package is outdated and ``raise_if_error`` is ``True``.
-        """
-        try:
-            import pyarrow
-        except ImportError as exc:  # pragma: NO COVER
-            if raise_if_error:
-                raise LegacyPyarrowError(
-                    f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}."
-                ) from exc
-            return None
-
-        if self.installed_version < _MIN_PYARROW_VERSION:
-            if raise_if_error:
-                msg = (
-                    "Dependency pyarrow is outdated, please upgrade "
-                    f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})."
-                )
-                raise LegacyPyarrowError(msg)
-            return None
-
-        return pyarrow
-
-
-PYARROW_VERSIONS = PyarrowVersions()
+def bq_to_arrow_scalars(bq_scalar: str):
+    """
+    Returns:
+        The Arrow scalar type that the input BigQuery scalar type maps to.
+        If it cannot find the BigQuery scalar, return None.
+    """
+    return _BQ_TO_ARROW_SCALARS.get(bq_scalar)
+
+def arrow_scalar_ids_to_bq(arrow_scalar: Any):
+    """
+    Returns:
+        The BigQuery scalar type that the input arrow scalar type maps to.
+        If it cannot find the arrow scalar, return None.
+    """
+    return _ARROW_SCALAR_IDS_TO_BQ.get(arrow_scalar)
diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py
new file mode 100644
index 000000000..1d1aed0f6
--- /dev/null
+++ b/google/cloud/bigquery/_versions_helpers.py
@@ -0,0 +1,96 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared helper functions for verifying versions of installed modules."""
+
+from typing import Any
+
+import packaging.version
+
+from google.cloud.bigquery.exceptions import LegacyPyarrowError
+
+
+_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")
+
+# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
+_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
+
+class PyarrowVersions:
+    """Version comparisons for pyarrow package."""
+
+    def __init__(self):
+        self._installed_version = None
+
+    @property
+    def installed_version(self) -> packaging.version.Version:
+        """Return the parsed version of pyarrow."""
+        if self._installed_version is None:
+            import pyarrow  # type: ignore
+
+            self._installed_version = packaging.version.parse(
+                # Use 0.0.0, since it is earlier than any released version.
+                # Legacy versions also have the same property, but
+                # creating a LegacyVersion has been deprecated.
+                # https://github.com/pypa/packaging/issues/321
+                getattr(pyarrow, "__version__", "0.0.0")
+            )
+
+        return self._installed_version
+
+    @property
+    def use_compliant_nested_type(self) -> bool:
+        return self.installed_version.major >= 4
+
+    def try_import(self, raise_if_error: bool = False) -> Any:
+        """Verify that a recent enough version of pyarrow extra is installed.
+
+        The function assumes that pyarrow extra is installed, and should thus
+        be used in places where this assumption holds.
+
+        Because `pip` can install an outdated version of this extra despite
+        the constraints in `setup.py`, the calling code can use this helper
+        to verify the version compatibility at runtime.
+
+        Returns:
+            The ``pyarrow`` module or ``None``.
+
+        Raises:
+            LegacyPyarrowError:
+                If the pyarrow package is outdated and ``raise_if_error`` is
+                ``True``.
+        """
+        try:
+            import pyarrow
+        except ImportError as exc:  # pragma: NO COVER
+            if raise_if_error:
+                raise LegacyPyarrowError(
+                    "pyarrow package not found. Install pyarrow version >="
+                    f" {_MIN_PYARROW_VERSION}."
+                ) from exc
+            return None
+
+        if self.installed_version < _MIN_PYARROW_VERSION:
+            if raise_if_error:
+                msg = (
+                    "Dependency pyarrow is outdated, please upgrade"
+                    f" it to version >= {_MIN_PYARROW_VERSION}"
+                    f" (version found: {self.installed_version})."
+                )
+                raise LegacyPyarrowError(msg)
+            return None
+
+        return pyarrow
+
+
+PYARROW_VERSIONS = PyarrowVersions()
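The gate in try_import reduces to a plain packaging.version comparison
against _MIN_PYARROW_VERSION. A small worked example of that check, using the
"2.0.0" and "3.0.0" values that appear in this module and in its tests:

    import packaging.version

    installed = packaging.version.parse("2.0.0")  # e.g. a legacy release
    minimum = packaging.version.Version("3.0.0")  # _MIN_PYARROW_VERSION

    # When this comparison is true, try_import() returns None, or raises
    # LegacyPyarrowError if raise_if_error=True was passed.
    assert installed < minimum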
diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py
index 11eb4d27e..556bd4fad 100644
--- a/google/cloud/bigquery/client.py
+++ b/google/cloud/bigquery/client.py
@@ -76,12 +76,14 @@
 from google.cloud.bigquery._helpers import _DEFAULT_HOST
 from google.cloud.bigquery._http import Connection
 from google.cloud.bigquery import _pandas_helpers
+from google.cloud.bigquery import _versions_helpers
 from google.cloud.bigquery.dataset import Dataset
 from google.cloud.bigquery.dataset import DatasetListItem
 from google.cloud.bigquery.dataset import DatasetReference
 from google.cloud.bigquery import enums
 from google.cloud.bigquery.enums import AutoRowIDs
 from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
+from google.cloud.bigquery.exceptions import LegacyPyarrowError
 from google.cloud.bigquery.opentelemetry_tracing import create_span
 from google.cloud.bigquery import job
 from google.cloud.bigquery.job import (
@@ -114,9 +116,7 @@
 from google.cloud.bigquery.table import RowIterator
 from google.cloud.bigquery.format_options import ParquetOptions
 
-from google.cloud.bigquery import _pyarrow_helpers
-
-pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import()
+pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()
 
 TimeoutType = Union[float, None]
 ResumableTimeoutType = Union[
@@ -2678,6 +2678,8 @@ def load_table_from_dataframe(
 
         try:
             if new_job_config.source_format == job.SourceFormat.PARQUET:
+                _versions_helpers.PYARROW_VERSIONS.try_import()
+
                 if new_job_config.schema:
                     if parquet_compression == "snappy":  # adjust the default value
                         parquet_compression = parquet_compression.upper()
@@ -2696,7 +2698,7 @@
                         compression=parquet_compression,
                         **(
                             {"use_compliant_nested_type": True}
-                            if _pyarrow_helpers.PYARROW_VERSIONS.use_compliant_nested_type
+                            if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type
                             else {}
                         ),
                     )
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index 5b29d30d6..8a6934ffe 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -74,8 +74,7 @@
     # Unconditionally import optional dependencies again to tell pytype that
     # they are not None, avoiding false "no attribute" errors.
     import pandas
-
-    pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import()
+    import pyarrow
     import geopandas  # type: ignore
     from google.cloud import bigquery_storage  # type: ignore
     from google.cloud.bigquery.dataset import DatasetReference
diff --git a/noxfile.py b/noxfile.py
index 841a7626f..ba06f925d 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -112,6 +112,13 @@ def unit(session):
 @nox.session(python=[UNIT_TEST_PYTHON_VERSIONS[0], UNIT_TEST_PYTHON_VERSIONS[-1]])
 def unit_noextras(session):
     """Run the unit test suite."""
+
+    # Install optional dependencies that are out-of-date.
+    # https://github.com/googleapis/python-bigquery/issues/933
+    # There is no pyarrow 1.0.0 package for Python 3.9.
+    if session.python == UNIT_TEST_PYTHON_VERSIONS[0]:
+        session.install("pyarrow==1.0.0")
+
     default(session, install_extras=False)
diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py
index d7f18b48b..7724f308b 100644
--- a/tests/unit/test__pandas_helpers.py
+++ b/tests/unit/test__pandas_helpers.py
@@ -42,10 +42,11 @@
 from google.cloud.bigquery import exceptions
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery import _pyarrow_helpers
+from google.cloud.bigquery import _versions_helpers
 from google.cloud.bigquery import schema
 from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT
 
-pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import()
+pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()
 
 if pyarrow:
     import pyarrow.parquet
@@ -1119,7 +1120,7 @@ def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch):
         "pyarrow not installed"
     )
     monkeypatch.setattr(
-        _pyarrow_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import
+        _versions_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import
     )
 
     with pytest.raises(exceptions.LegacyPyarrowError):
diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py
index c35916ec3..6d9a5bf5c 100644
--- a/tests/unit/test__pyarrow_helpers.py
+++ b/tests/unit/test__pyarrow_helpers.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
+import pytest
 
-import mock
 
 try:
     import pyarrow
@@ -22,75 +21,26 @@
     pyarrow = None
 
 
-@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
-class TestPyarrowVersions(unittest.TestCase):
-    def tearDown(self):
-        from google.cloud.bigquery import _pyarrow_helpers
+@pytest.fixture
+def module_under_test():
+    from google.cloud.bigquery import _pyarrow_helpers
 
-        # Reset any cached versions since it may not match reality.
-        _pyarrow_helpers.PYARROW_VERSIONS._installed_version = None
+    return _pyarrow_helpers
 
-    def _object_under_test(self):
-        from google.cloud.bigquery import _pyarrow_helpers
 
-        return _pyarrow_helpers.PyarrowVersions()
+@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
+def test_bq_to_arrow_scalars(module_under_test):
+    assert (
+        module_under_test.bq_to_arrow_scalars("BIGNUMERIC")
+        == module_under_test.pyarrow_bignumeric
+    )
+    assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None
 
-    def _call_try_import(self, **kwargs):
-        from google.cloud.bigquery import _pyarrow_helpers
 
-        _pyarrow_helpers.PYARROW_VERSIONS._installed_version = None
-        return _pyarrow_helpers.PYARROW_VERSIONS.try_import(**kwargs)
-
-    def test_try_import_raises_no_error_w_recent_pyarrow(self):
-        from google.cloud.bigquery.exceptions import LegacyPyarrowError
-
-        with mock.patch("pyarrow.__version__", new="5.0.0"):
-            try:
-                pyarrow = self._call_try_import(raise_if_error=True)
-                self.assertIsNotNone(pyarrow)
-            except LegacyPyarrowError:  # pragma: NO COVER
-                self.fail("Legacy error raised with a non-legacy dependency version.")
-
-    def test_try_import_returns_none_w_legacy_pyarrow(self):
-        with mock.patch("pyarrow.__version__", new="2.0.0"):
-            pyarrow = self._call_try_import()
-            self.assertIsNone(pyarrow)
-
-    def test_try_import_raises_error_w_legacy_pyarrow(self):
-        from google.cloud.bigquery.exceptions import LegacyPyarrowError
-
-        with mock.patch("pyarrow.__version__", new="2.0.0"):
-            with self.assertRaises(LegacyPyarrowError):
-                self._call_try_import(raise_if_error=True)
-
-    def test_installed_version_returns_cached(self):
-        versions = self._object_under_test()
-        versions._installed_version = object()
-        assert versions.installed_version is versions._installed_version
-
-    def test_installed_version_returns_parsed_version(self):
-        versions = self._object_under_test()
-
-        with mock.patch("pyarrow.__version__", new="1.2.3"):
-            version = versions.installed_version
-
-        assert version.major == 1
-        assert version.minor == 2
-        assert version.micro == 3
-
-    def test_bq_to_arrow_scalars(self):
-        from google.cloud.bigquery import _pyarrow_helpers
-
-        versions = self._object_under_test()
-
-        assert (
-            versions.bq_to_arrow_scalars("BIGNUMERIC")
-            == _pyarrow_helpers.pyarrow_bignumeric
-        )
-        assert versions.bq_to_arrow_scalars("UNKNOWN_TYPE") is None
-
-    def test_arrow_scalar_ids_to_bq(self):
-        versions = self._object_under_test()
-
-        assert versions.arrow_scalar_ids_to_bq(pyarrow.bool_().id) == "BOOL"
-        assert versions.arrow_scalar_ids_to_bq("UNKNOWN_TYPE") is None
+@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
+def test_arrow_scalar_ids_to_bq(module_under_test):
+    assert (
+        module_under_test.arrow_scalar_ids_to_bq(pyarrow.bool_().id)
+        == "BOOL"
+    )
+    assert module_under_test.arrow_scalar_ids_to_bq("UNKNOWN_TYPE") is None
diff --git a/tests/unit/test__versions_helpers.py b/tests/unit/test__versions_helpers.py
new file mode 100644
index 000000000..313b8f09d
--- /dev/null
+++ b/tests/unit/test__versions_helpers.py
@@ -0,0 +1,72 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import mock
+try:
+    import pyarrow
+except ImportError:  # pragma: NO COVER
+    pyarrow = None
+
+from google.cloud.bigquery import _versions_helpers
+
+
+@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
+def test_try_import_raises_no_error_w_recent_pyarrow():
+    from google.cloud.bigquery.exceptions import LegacyPyarrowError
+
+    versions = _versions_helpers.PyarrowVersions()
+    with mock.patch("pyarrow.__version__", new="5.0.0"):
+        try:
+            pyarrow = versions.try_import(raise_if_error=True)
+            assert pyarrow is not None
+        except LegacyPyarrowError:  # pragma: NO COVER
+            pytest.fail("Legacy error raised with a non-legacy dependency version.")
+
+
+@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
+def test_try_import_returns_none_w_legacy_pyarrow():
+    versions = _versions_helpers.PyarrowVersions()
+    with mock.patch("pyarrow.__version__", new="2.0.0"):
+        pyarrow = versions.try_import()
+        assert pyarrow is None
+
+
+@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
+def test_try_import_raises_error_w_legacy_pyarrow():
+    from google.cloud.bigquery.exceptions import LegacyPyarrowError
+
+    versions = _versions_helpers.PyarrowVersions()
+    with mock.patch("pyarrow.__version__", new="2.0.0"):
+        with pytest.raises(LegacyPyarrowError):
+            versions.try_import(raise_if_error=True)
+
+
+@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
+def test_installed_version_returns_cached():
+    versions = _versions_helpers.PyarrowVersions()
+    versions._installed_version = object()
+    assert versions.installed_version is versions._installed_version
+
+
+@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
+def test_installed_version_returns_parsed_version():
+    versions = _versions_helpers.PyarrowVersions()
+    with mock.patch("pyarrow.__version__", new="1.2.3"):
+        version = versions.installed_version
+
+    assert version.major == 1
+    assert version.minor == 2
+    assert version.micro == 3
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py
index 03027abc6..75879e524 100644
--- a/tests/unit/test_client.py
+++ b/tests/unit/test_client.py
@@ -28,6 +28,7 @@
 
 import mock
 import requests
+import packaging
 import pytest
 import pkg_resources
@@ -64,6 +65,7 @@
 
 from google.cloud import bigquery
 from google.cloud.bigquery.dataset import DatasetReference
+from google.cloud.bigquery.exceptions import LegacyPyarrowError
 from google.cloud.bigquery.retry import DEFAULT_TIMEOUT
 from google.cloud.bigquery import ParquetOptions
@@ -8605,6 +8607,36 @@ def test_load_table_from_dataframe_wo_pyarrow_raises_error(self):
                 parquet_compression="gzip",
             )
 
+    def test_load_table_from_dataframe_w_bad_pyarrow_raises_error(self):
+        pytest.importorskip("pandas", reason="Requires `pandas`")
+        pytest.importorskip("pyarrow", reason="Requires `pyarrow`")
+
+        client = self._make_client()
+        records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
+        dataframe = pandas.DataFrame(records)
+
+        pyarrow_version_patch = mock.patch(
+            "google.cloud.bigquery._versions_helpers.PYARROW_VERSIONS._installed_version",
+            packaging.version.parse("2.0.0"),  # A known bad version of pyarrow.
+        )
+        get_table_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.get_table",
+            autospec=True,
+            side_effect=google.api_core.exceptions.NotFound("Table not found"),
+        )
+        load_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
+        )
+
+        with load_patch, get_table_patch, pyarrow_version_patch:
+            with pytest.raises(LegacyPyarrowError):
+                client.load_table_from_dataframe(
+                    dataframe,
+                    self.TABLE_REF,
+                    location=self.LOCATION,
+                )
+
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
     def test_load_table_from_dataframe_w_nulls(self):
diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py
index 046e9d8d6..3e983516e 100644
--- a/tests/unit/test_table.py
+++ b/tests/unit/test_table.py
@@ -41,9 +41,9 @@
     big_query_read_grpc_transport = None
 
 from google.cloud.bigquery import _helpers
-from google.cloud.bigquery import _pyarrow_helpers
+from google.cloud.bigquery import _versions_helpers
 
-pyarrow = _pyarrow_helpers.PYARROW_VERSIONS.try_import()
+pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()
 
 if pyarrow:
     import pyarrow.types
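For context on the new test_load_table_from_dataframe_w_bad_pyarrow_raises_error
case: end to end, a parquet-based dataframe load on a legacy pyarrow now
surfaces as an exception instead of silently producing bad files. A
hypothetical session illustrating that failure mode (the dataset and table
names are placeholders, and a Client with default credentials is assumed):

    import pandas
    from google.cloud import bigquery
    from google.cloud.bigquery.exceptions import LegacyPyarrowError

    client = bigquery.Client()
    dataframe = pandas.DataFrame([{"id": 1, "age": 100}, {"id": 2, "age": 60}])

    try:
        client.load_table_from_dataframe(dataframe, "my_dataset.my_table")
    except LegacyPyarrowError as exc:
        # e.g. "Dependency pyarrow is outdated, please upgrade it to
        # version >= 3.0.0 (version found: 2.0.0)."
        print(exc)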

From ab266a7b5568f7ebe26789be013629b26940e6eb Mon Sep 17 00:00:00 2001
From: Linchin
Date: Mon, 16 Oct 2023 20:15:36 +0000
Subject: [PATCH 08/12] fix tests

---
 google/cloud/bigquery/_pandas_helpers.py   |  8 ++------
 google/cloud/bigquery/_pyarrow_helpers.py  | 13 +++++++++++++
 google/cloud/bigquery/_versions_helpers.py |  1 +
 google/cloud/bigquery/client.py            |  1 -
 google/cloud/bigquery/table.py             |  1 -
 tests/unit/test__pyarrow_helpers.py        |  5 +----
 tests/unit/test__versions_helpers.py       |  5 +++--
 tests/unit/test_client.py                  |  1 -
 8 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
index 66db97090..ea790d6c9 100644
--- a/google/cloud/bigquery/_pandas_helpers.py
+++ b/google/cloud/bigquery/_pandas_helpers.py
@@ -163,9 +163,7 @@ def bq_to_arrow_data_type(field):
     if field_type_upper in schema._STRUCT_TYPES:
         return bq_to_arrow_struct_data_type(field)
 
-    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(
-        field_type_upper
-    )
+    data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper)
     if data_type_constructor is None:
         return None
     return data_type_constructor()
@@ -511,9 +509,7 @@ def augment_schema(dataframe, current_bq_schema):
                 detected_type = "DATETIME"
             else:
                 detected_mode = field.mode
-                detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(
-                    arrow_table.type.id
-                )
+                detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
 
                 if detected_type is None:
                     unknown_type_fields.append(field)
diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py
index 17d40485c..2d83aa13a 100644
--- a/google/cloud/bigquery/_pyarrow_helpers.py
+++ b/google/cloud/bigquery/_pyarrow_helpers.py
@@ -16,6 +16,8 @@
 
 from typing import Any
 
+from packaging import version
+
 try:
     import pyarrow  # type: ignore
 except ImportError:  # pragma: NO COVER
@@ -95,6 +97,16 @@ def pyarrow_timestamp():
         pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
     }
 
+    # Adds bignumeric support only if pyarrow version >= 3.0.0
+    # Decimal256 support was added to arrow 3.0.0
+    # https://arrow.apache.org/blog/2021/01/25/3.0.0-release/
+    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
+        _BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
+        # The exact decimal's scale and precision are not important, as only
+        # the type ID matters, and it's the same for all decimal256 instances.
+        _ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
+
+
 def bq_to_arrow_scalars(bq_scalar: str):
     """
     Returns:
@@ -103,6 +115,7 @@
     """
     return _BQ_TO_ARROW_SCALARS.get(bq_scalar)
 
+
 def arrow_scalar_ids_to_bq(arrow_scalar: Any):
     """
     Returns:
diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py
index 1d1aed0f6..9c767d5b1 100644
--- a/google/cloud/bigquery/_versions_helpers.py
+++ b/google/cloud/bigquery/_versions_helpers.py
@@ -26,6 +26,7 @@
 # https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
 _PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
 
+
 class PyarrowVersions:
     """Version comparisons for pyarrow package."""
 
diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py
index 556bd4fad..539c7959d 100644
--- a/google/cloud/bigquery/client.py
+++ b/google/cloud/bigquery/client.py
@@ -83,7 +83,6 @@
 from google.cloud.bigquery import enums
 from google.cloud.bigquery.enums import AutoRowIDs
 from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
-from google.cloud.bigquery.exceptions import LegacyPyarrowError
 from google.cloud.bigquery.opentelemetry_tracing import create_span
 from google.cloud.bigquery import job
 from google.cloud.bigquery.job import (
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index 8a6934ffe..462447d51 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -60,7 +60,6 @@
 import google.cloud._helpers  # type: ignore
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery import _pandas_helpers
-from google.cloud.bigquery import _pyarrow_helpers
 from google.cloud.bigquery.enums import DefaultPandasDTypes
 from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
 from google.cloud.bigquery.schema import _build_schema_resource
diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py
index 6d9a5bf5c..b20817018 100644
--- a/tests/unit/test__pyarrow_helpers.py
+++ b/tests/unit/test__pyarrow_helpers.py
@@ -39,8 +39,5 @@ def test_bq_to_arrow_scalars(module_under_test):
 
 @pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
 def test_arrow_scalar_ids_to_bq(module_under_test):
-    assert (
-        module_under_test.arrow_scalar_ids_to_bq(pyarrow.bool_().id)
-        == "BOOL"
-    )
+    assert module_under_test.arrow_scalar_ids_to_bq(pyarrow.bool_().id) == "BOOL"
     assert module_under_test.arrow_scalar_ids_to_bq("UNKNOWN_TYPE") is None
diff --git a/tests/unit/test__versions_helpers.py b/tests/unit/test__versions_helpers.py
index 313b8f09d..e141911c8 100644
--- a/tests/unit/test__versions_helpers.py
+++ b/tests/unit/test__versions_helpers.py
@@ -15,6 +15,7 @@
 import pytest
 
 import mock
+
 try:
     import pyarrow
 except ImportError:  # pragma: NO COVER
@@ -50,7 +51,7 @@ def test_try_import_raises_error_w_legacy_pyarrow():
 
     versions = _versions_helpers.PyarrowVersions()
     with mock.patch("pyarrow.__version__", new="2.0.0"):
-        with pytest.raises(LegacyPyarrowError):
+        with pytest.raises(LegacyPyarrowError):
             versions.try_import(raise_if_error=True)
 
 
@@ -69,4 +70,4 @@ def test_installed_version_returns_parsed_version():
 
     assert version.major == 1
     assert version.minor == 2
-    assert version.micro == 3
\ No newline at end of file
+    assert version.micro == 3
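The _pyarrow_helpers hunk above registers the BIGNUMERIC mapping only when
the installed pyarrow ships decimal256, that is, Arrow 3.0.0 or later. A
hedged sketch of how the lookup behaves on either side of that gate
(assuming pyarrow is importable at all):

    from packaging import version
    import pyarrow

    from google.cloud.bigquery import _pyarrow_helpers

    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
        # decimal256 exists, so the BIGNUMERIC entries were registered.
        assert _pyarrow_helpers.bq_to_arrow_scalars("BIGNUMERIC") is not None
    else:
        # On older pyarrow the dictionary lookup simply misses.
        assert _pyarrow_helpers.bq_to_arrow_scalars("BIGNUMERIC") is None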
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py
index 75879e524..0adbc2999 100644
--- a/tests/unit/test_client.py
+++ b/tests/unit/test_client.py
@@ -8636,7 +8636,6 @@ def test_load_table_from_dataframe_w_bad_pyarrow_raises_error(self):
                     location=self.LOCATION,
                 )
 
-
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
     def test_load_table_from_dataframe_w_nulls(self):

From 1903dae12d4fc02f3d10ed5117cf43be901b7a9b Mon Sep 17 00:00:00 2001
From: Linchin
Date: Mon, 16 Oct 2023 21:05:48 +0000
Subject: [PATCH 09/12] coverage

---
 google/cloud/bigquery/_pyarrow_helpers.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py
index 2d83aa13a..7266e5e02 100644
--- a/google/cloud/bigquery/_pyarrow_helpers.py
+++ b/google/cloud/bigquery/_pyarrow_helpers.py
@@ -68,7 +68,6 @@ def pyarrow_timestamp():
         "STRING": pyarrow.string,
         "TIME": pyarrow_time,
         "TIMESTAMP": pyarrow_timestamp,
-        "BIGNUMERIC": pyarrow_bignumeric,
     }
 
     _ARROW_SCALAR_IDS_TO_BQ = {
@@ -94,7 +93,6 @@
         pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
         # The exact scale and precision don't matter, see below.
         pyarrow.decimal128(38, scale=9).id: "NUMERIC",
-        pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
     }
 
     # Adds bignumeric support only if pyarrow version >= 3.0.0

From b6cec0858d90bcca2fa07de5d2e184fe4d236d97 Mon Sep 17 00:00:00 2001
From: Lingqing Gan
Date: Tue, 17 Oct 2023 12:34:44 -0700
Subject: [PATCH 10/12] accept suggestion

Co-authored-by: Tim Swast
---
 tests/unit/test__pyarrow_helpers.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py
index b20817018..f0a872c88 100644
--- a/tests/unit/test__pyarrow_helpers.py
+++ b/tests/unit/test__pyarrow_helpers.py
@@ -15,10 +15,7 @@
 
 import pytest
 
-try:
-    import pyarrow
-except ImportError:  # pragma: NO COVER
-    pyarrow = None
+pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0")
 
 
 @pytest.fixture
@@ -28,7 +25,6 @@ def module_under_test():
     return _pyarrow_helpers
 
 
-@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
 def test_bq_to_arrow_scalars(module_under_test):
     assert (
         module_under_test.bq_to_arrow_scalars("BIGNUMERIC")
@@ -37,7 +33,6 @@
     assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None
 
 
-@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
 def test_arrow_scalar_ids_to_bq(module_under_test):
     assert module_under_test.arrow_scalar_ids_to_bq(pyarrow.bool_().id) == "BOOL"
     assert module_under_test.arrow_scalar_ids_to_bq("UNKNOWN_TYPE") is None

From 0b827e0c514f4e232a3026f5ea238419a10f1805 Mon Sep 17 00:00:00 2001
From: Lingqing Gan
Date: Tue, 17 Oct 2023 12:37:14 -0700
Subject: [PATCH 11/12] accept suggestion

Co-authored-by: Tim Swast
---
 tests/unit/test__versions_helpers.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tests/unit/test__versions_helpers.py b/tests/unit/test__versions_helpers.py
index e141911c8..4b7782a41 100644
--- a/tests/unit/test__versions_helpers.py
+++ b/tests/unit/test__versions_helpers.py
@@ -16,15 +16,11 @@
 
 import mock
 
-try:
-    import pyarrow
-except ImportError:  # pragma: NO COVER
-    pyarrow = None
+pyarrow = pytest.importorskip("pyarrow")
 
 from google.cloud.bigquery import _versions_helpers
 
 
-@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
 def test_try_import_raises_no_error_w_recent_pyarrow():
     from google.cloud.bigquery.exceptions import LegacyPyarrowError
 
@@ -37,7 +33,6 @@ def test_try_import_raises_no_error_w_recent_pyarrow():
             pytest.fail("Legacy error raised with a non-legacy dependency version.")
 
 
-@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
 def test_try_import_returns_none_w_legacy_pyarrow():
     versions = _versions_helpers.PyarrowVersions()
     with mock.patch("pyarrow.__version__", new="2.0.0"):
@@ -45,7 +40,6 @@ def test_try_import_returns_none_w_legacy_pyarrow():
         assert pyarrow is None
 
 
-@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
 def test_try_import_raises_error_w_legacy_pyarrow():
     from google.cloud.bigquery.exceptions import LegacyPyarrowError
 
@@ -55,14 +49,12 @@ def test_try_import_raises_error_w_legacy_pyarrow():
             versions.try_import(raise_if_error=True)
 
 
-@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
 def test_installed_version_returns_cached():
     versions = _versions_helpers.PyarrowVersions()
     versions._installed_version = object()
     assert versions.installed_version is versions._installed_version
 
 
-@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
 def test_installed_version_returns_parsed_version():
     versions = _versions_helpers.PyarrowVersions()
     with mock.patch("pyarrow.__version__", new="1.2.3"):
         version = versions.installed_version
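Patches 10 and 11 swap the per-test @pytest.mark.skipif decorators for a
single module-level pytest.importorskip, which skips the whole file when the
dependency is missing (and, with minversion, when it is too old). A minimal
sketch of the pattern:

    import pytest

    # Skips every test in this module unless pyarrow >= 3.0.0 is importable.
    pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0")

    def test_uses_pyarrow():
        assert isinstance(pyarrow.bool_().id, int)

The module-level call also removes the try/except import boilerplate the
decorators relied on to define the `pyarrow` name at collection time.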

From c1b8d0d3119b5dc7c37aac93b610e699267916e4 Mon Sep 17 00:00:00 2001
From: Linchin
Date: Tue, 17 Oct 2023 22:27:51 +0000
Subject: [PATCH 12/12] address comments

---
 google/cloud/bigquery/_helpers.py          |  6 +++---
 google/cloud/bigquery/_versions_helpers.py | 11 ++++-------
 google/cloud/bigquery/client.py            |  6 ++----
 google/cloud/bigquery/table.py             |  4 ++--
 tests/unit/test__helpers.py                | 16 +++++++---------
 tests/unit/test__versions_helpers.py       | 13 +++++--------
 tests/unit/test_client.py                  | 12 ++++--------
 tests/unit/test_magics.py                  |  5 ++---
 tests/unit/test_table.py                   | 10 ++++------
 9 files changed, 33 insertions(+), 50 deletions(-)

diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py
index 4d4cdce07..488766853 100644
--- a/google/cloud/bigquery/_helpers.py
+++ b/google/cloud/bigquery/_helpers.py
@@ -32,7 +32,7 @@
 
 import packaging.version
 
-from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
+from google.cloud.bigquery import exceptions
 
 _RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f"
 _TIMEONLY_WO_MICROS = "%H:%M:%S"
@@ -110,7 +110,7 @@ def verify_version(self):
         verify the version compatibility at runtime.
 
         Raises:
-            LegacyBigQueryStorageError:
+            exceptions.LegacyBigQueryStorageError:
                 If the google-cloud-bigquery-storage package is outdated.
         """
         if self.installed_version < _MIN_BQ_STORAGE_VERSION:
@@ -118,7 +118,7 @@
                 "Dependency google-cloud-bigquery-storage is outdated, please upgrade "
                 f"it to version >= {_MIN_BQ_STORAGE_VERSION} (version found: {self.installed_version})."
             )
-            raise LegacyBigQueryStorageError(msg)
+            raise exceptions.LegacyBigQueryStorageError(msg)
 
 
 BQ_STORAGE_VERSIONS = BQStorageVersions()
diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py
index 9c767d5b1..1f04c74e0 100644
--- a/google/cloud/bigquery/_versions_helpers.py
+++ b/google/cloud/bigquery/_versions_helpers.py
@@ -18,14 +18,11 @@
 
 import packaging.version
 
-from google.cloud.bigquery.exceptions import LegacyPyarrowError
+from google.cloud.bigquery import exceptions
 
 
 _MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")
 
-# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
-_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
-
 
 class PyarrowVersions:
     """Version comparisons for pyarrow package."""
@@ -67,7 +64,7 @@ def try_import(self, raise_if_error: bool = False) -> Any:
             The ``pyarrow`` module or ``None``.
 
         Raises:
-            LegacyPyarrowError:
+            exceptions.LegacyPyarrowError:
                 If the pyarrow package is outdated and ``raise_if_error`` is
                 ``True``.
         """
@@ -75,7 +72,7 @@
             import pyarrow
         except ImportError as exc:  # pragma: NO COVER
             if raise_if_error:
-                raise LegacyPyarrowError(
+                raise exceptions.LegacyPyarrowError(
                     "pyarrow package not found. Install pyarrow version >="
                     f" {_MIN_PYARROW_VERSION}."
                 ) from exc
@@ -88,7 +85,7 @@
                     f" it to version >= {_MIN_PYARROW_VERSION}"
                     f" (version found: {self.installed_version})."
                 )
-                raise LegacyPyarrowError(msg)
+                raise exceptions.LegacyPyarrowError(msg)
             return None
 
         return pyarrow
diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py
index 539c7959d..ed75215b6 100644
--- a/google/cloud/bigquery/client.py
+++ b/google/cloud/bigquery/client.py
@@ -82,7 +82,7 @@
 from google.cloud.bigquery.dataset import DatasetReference
 from google.cloud.bigquery import enums
 from google.cloud.bigquery.enums import AutoRowIDs
-from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
+from google.cloud.bigquery import exceptions as bq_exceptions
 from google.cloud.bigquery.opentelemetry_tracing import create_span
 from google.cloud.bigquery import job
 from google.cloud.bigquery.job import (
@@ -565,7 +565,7 @@ def _ensure_bqstorage_client(
 
         try:
             BQ_STORAGE_VERSIONS.verify_version()
-        except LegacyBigQueryStorageError as exc:
+        except bq_exceptions.LegacyBigQueryStorageError as exc:
             warnings.warn(str(exc))
             return None
         if bqstorage_client is None:
@@ -2677,8 +2677,6 @@
 
         try:
             if new_job_config.source_format == job.SourceFormat.PARQUET:
-                _versions_helpers.PYARROW_VERSIONS.try_import()
-
                 if new_job_config.schema:
                     if parquet_compression == "snappy":  # adjust the default value
                         parquet_compression = parquet_compression.upper()
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index 462447d51..a967a1795 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -61,7 +61,7 @@
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery import _pandas_helpers
 from google.cloud.bigquery.enums import DefaultPandasDTypes
-from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
+from google.cloud.bigquery import exceptions
 from google.cloud.bigquery.schema import _build_schema_resource
 from google.cloud.bigquery.schema import _parse_schema_resource
 from google.cloud.bigquery.schema import _to_schema_fields
@@ -1616,7 +1616,7 @@ def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client):
 
         try:
             _helpers.BQ_STORAGE_VERSIONS.verify_version()
-        except LegacyBigQueryStorageError as exc:
+        except exceptions.LegacyBigQueryStorageError as exc:
             warnings.warn(str(exc))
             return False
diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py
index bebb6467c..40223f041 100644
--- a/tests/unit/test__helpers.py
+++ b/tests/unit/test__helpers.py
@@ -19,6 +19,8 @@
 
 import mock
 
+from google.cloud.bigquery import exceptions
+
 try:
     from google.cloud import bigquery_storage  # type: ignore
 except ImportError:  # pragma: NO COVER
@@ -45,28 +47,24 @@ def _call_fut(self):
         return _helpers.BQ_STORAGE_VERSIONS.verify_version()
 
     def test_raises_no_error_w_recent_bqstorage(self):
-        from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
-
         with mock.patch("google.cloud.bigquery_storage.__version__", new="2.0.0"):
             try:
                 self._call_fut()
-            except LegacyBigQueryStorageError:  # pragma: NO COVER
+            except exceptions.LegacyBigQueryStorageError:  # pragma: NO COVER
                 self.fail("Legacy error raised with a non-legacy dependency version.")
 
     def test_raises_error_w_legacy_bqstorage(self):
-        from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
-
         with mock.patch("google.cloud.bigquery_storage.__version__", new="1.9.9"):
-            with self.assertRaises(LegacyBigQueryStorageError):
+            with self.assertRaises(exceptions.LegacyBigQueryStorageError):
                 self._call_fut()
 
     def test_raises_error_w_unknown_bqstorage_version(self):
-        from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
-
         with mock.patch("google.cloud.bigquery_storage", autospec=True) as fake_module:
             del fake_module.__version__
             error_pattern = r"version found: 0.0.0"
-            with self.assertRaisesRegex(LegacyBigQueryStorageError, error_pattern):
+            with self.assertRaisesRegex(
+                exceptions.LegacyBigQueryStorageError, error_pattern
+            ):
                 self._call_fut()
 
     def test_installed_version_returns_cached(self):
diff --git a/tests/unit/test__versions_helpers.py b/tests/unit/test__versions_helpers.py
index 4b7782a41..21386610b 100644
--- a/tests/unit/test__versions_helpers.py
+++ b/tests/unit/test__versions_helpers.py
@@ -16,20 +16,19 @@
 
 import mock
 
-pyarrow = pytest.importorskip("pyarrow")
-
 from google.cloud.bigquery import _versions_helpers
+from google.cloud.bigquery import exceptions
+
+pyarrow = pytest.importorskip("pyarrow")
 
 
 def test_try_import_raises_no_error_w_recent_pyarrow():
-    from google.cloud.bigquery.exceptions import LegacyPyarrowError
-
     versions = _versions_helpers.PyarrowVersions()
     with mock.patch("pyarrow.__version__", new="5.0.0"):
         try:
             pyarrow = versions.try_import(raise_if_error=True)
             assert pyarrow is not None
-        except LegacyPyarrowError:  # pragma: NO COVER
+        except exceptions.LegacyPyarrowError:  # pragma: NO COVER
             pytest.fail("Legacy error raised with a non-legacy dependency version.")
@@ -41,11 +40,9 @@ def test_try_import_returns_none_w_legacy_pyarrow():
 
 
 def test_try_import_raises_error_w_legacy_pyarrow():
-    from google.cloud.bigquery.exceptions import LegacyPyarrowError
-
     versions = _versions_helpers.PyarrowVersions()
     with mock.patch("pyarrow.__version__", new="2.0.0"):
-        with pytest.raises(LegacyPyarrowError):
+        with pytest.raises(exceptions.LegacyPyarrowError):
             versions.try_import(raise_if_error=True)
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py
index 0adbc2999..3143f2123 100644
--- a/tests/unit/test_client.py
+++ b/tests/unit/test_client.py
@@ -65,7 +65,7 @@
 
 from google.cloud import bigquery
 from google.cloud.bigquery.dataset import DatasetReference
-from google.cloud.bigquery.exceptions import LegacyPyarrowError
+from google.cloud.bigquery import exceptions
 from google.cloud.bigquery.retry import DEFAULT_TIMEOUT
 from google.cloud.bigquery import ParquetOptions
@@ -822,14 +822,12 @@ def fail_bqstorage_import(name, globals, locals, fromlist, level):
         bigquery_storage is None, "Requires `google-cloud-bigquery-storage`"
     )
     def test_ensure_bqstorage_client_obsolete_dependency(self):
-        from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
-
         creds = _make_credentials()
         client = self._make_one(project=self.PROJECT, credentials=creds)
 
         patcher = mock.patch(
             "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version",
-            side_effect=LegacyBigQueryStorageError("BQ Storage too old"),
+            side_effect=exceptions.LegacyBigQueryStorageError("BQ Storage too old"),
         )
         with patcher, warnings.catch_warnings(record=True) as warned:
             bqstorage_client = client._ensure_bqstorage_client()
@@ -858,15 +856,13 @@ def test_ensure_bqstorage_client_existing_client_check_passes(self):
         bigquery_storage is None, "Requires `google-cloud-bigquery-storage`"
     )
     def test_ensure_bqstorage_client_existing_client_check_fails(self):
-        from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
-
         creds = _make_credentials()
         client = self._make_one(project=self.PROJECT, credentials=creds)
         mock_storage_client = mock.sentinel.mock_storage_client
 
         patcher = mock.patch(
             "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version",
-            side_effect=LegacyBigQueryStorageError("BQ Storage too old"),
+            side_effect=exceptions.LegacyBigQueryStorageError("BQ Storage too old"),
         )
         with patcher, warnings.catch_warnings(record=True) as warned:
             bqstorage_client = client._ensure_bqstorage_client(mock_storage_client)
@@ -8629,7 +8625,7 @@ def test_load_table_from_dataframe_w_bad_pyarrow_raises_error(self):
         )
 
         with load_patch, get_table_patch, pyarrow_version_patch:
-            with pytest.raises(LegacyPyarrowError):
+            with pytest.raises(exceptions.LegacyPyarrowError):
                 client.load_table_from_dataframe(
                     dataframe,
                     self.TABLE_REF,
diff --git a/tests/unit/test_magics.py b/tests/unit/test_magics.py
index 70bfc4d0c..0cab943f7 100644
--- a/tests/unit/test_magics.py
+++ b/tests/unit/test_magics.py
@@ -25,6 +25,7 @@
 from test_utils.imports import maybe_fail_import
 
 from google.cloud import bigquery
+from google.cloud.bigquery import exceptions as bq_exceptions
 from google.cloud.bigquery import job
 from google.cloud.bigquery import table
 from google.cloud.bigquery.retry import DEFAULT_TIMEOUT
@@ -357,8 +358,6 @@ def test__make_bqstorage_client_true_raises_import_error(missing_bq_storage):
     bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`"
 )
 def test__make_bqstorage_client_true_obsolete_dependency():
-    from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
-
     credentials_mock = mock.create_autospec(
         google.auth.credentials.Credentials, instance=True
     )
@@ -368,7 +367,7 @@
     patcher = mock.patch(
         "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version",
-        side_effect=LegacyBigQueryStorageError("BQ Storage too old"),
+        side_effect=bq_exceptions.LegacyBigQueryStorageError("BQ Storage too old"),
     )
     with patcher, warnings.catch_warnings(record=True) as warned:
         got = magics._make_bqstorage_client(test_client, True, {})
diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py
index 3e983516e..65eb659bf 100644
--- a/tests/unit/test_table.py
+++ b/tests/unit/test_table.py
@@ -28,6 +28,8 @@
 import google.api_core.exceptions
 from test_utils.imports import maybe_fail_import
 
+from google.cloud.bigquery import _versions_helpers
+from google.cloud.bigquery import exceptions
 from google.cloud.bigquery.table import TableReference
 from google.cloud.bigquery.dataset import DatasetReference
 
@@ -40,8 +42,6 @@
     bigquery_storage = None
     big_query_read_grpc_transport = None
 
-from google.cloud.bigquery import _helpers
-from google.cloud.bigquery import _versions_helpers
 
 pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()
 
 if pyarrow:
     import pyarrow.types
@@ -2257,13 +2257,11 @@ def fail_bqstorage_import(name, globals, locals, fromlist, level):
         bigquery_storage is None, "Requires `google-cloud-bigquery-storage`"
     )
     def test__validate_bqstorage_returns_false_w_warning_if_obsolete_version(self):
-        from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
-
         iterator = self._make_one(first_page_response=None)  # not cached
 
         patcher = mock.patch(
             "google.cloud.bigquery.table._helpers.BQ_STORAGE_VERSIONS.verify_version",
-            side_effect=LegacyBigQueryStorageError("BQ Storage too old"),
+            side_effect=exceptions.LegacyBigQueryStorageError("BQ Storage too old"),
         )
         with patcher, warnings.catch_warnings(record=True) as warned:
             result = iterator._validate_bqstorage(
@@ -2869,7 +2867,7 @@ def test_to_arrow_ensure_bqstorage_client_wo_bqstorage(self):
         row_iterator = self._make_one(mock_client, api_request, path, schema)
 
         def mock_verify_version():
-            raise _helpers.LegacyBigQueryStorageError("no bqstorage")
+            raise exceptions.LegacyBigQueryStorageError("no bqstorage")
 
         with mock.patch(
             "google.cloud.bigquery._helpers.BQ_STORAGE_VERSIONS.verify_version",