From 21b044f63d58c73d26080e6e1559d70f808fc70d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 28 Sep 2023 17:28:20 -0500 Subject: [PATCH 01/11] feat: support STRUCT data type with `Series.struct.field` to extract subfields --- bigframes/dtypes.py | 66 +++++++++++++++++-- bigframes/operations/structs.py | 47 +++++++++++++ .../pandas/core/arrays/__init__.py | 0 .../pandas/core/arrays/arrow/__init__.py | 0 .../pandas/core/arrays/arrow/accessors.py | 63 ++++++++++++++++++ 5 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 bigframes/operations/structs.py create mode 100644 third_party/bigframes_vendored/pandas/core/arrays/__init__.py create mode 100644 third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py create mode 100644 third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 271b8aa2f2..644fda0913 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -84,10 +84,10 @@ BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, Dtype]] = ( (ibis_dtypes.boolean, pd.BooleanDtype()), + (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), (ibis_dtypes.float64, pd.Float64Dtype()), (ibis_dtypes.int64, pd.Int64Dtype()), (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), (ibis_dtypes.Timestamp(timezone=None), pd.ArrowDtype(pa.timestamp("us"))), ( @@ -100,6 +100,19 @@ pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS } +IBIS_TO_ARROW: Dict[ibis_dtypes.DataType, pa.DataType] = { + ibis_dtypes.boolean: pa.bool_(), + ibis_dtypes.date: pa.date32(), + ibis_dtypes.float64: pa.float64(), + ibis_dtypes.int64: pa.int64(), + ibis_dtypes.string: pa.string(), + ibis_dtypes.time: pa.time64("us"), + ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), + ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), +} + +ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} + IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, Union[Dtype, np.dtype[Any]]] = { ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS } @@ -148,11 +161,12 @@ def ibis_dtype_to_bigframes_dtype( # Special cases: Ibis supports variations on these types, but currently # our IO returns them as objects. Eventually, we should support them as # ArrowDType (and update the IO accordingly) - if isinstance(ibis_dtype, ibis_dtypes.Array) or isinstance( - ibis_dtype, ibis_dtypes.Struct - ): + if isinstance(ibis_dtype, ibis_dtypes.Array): return np.dtype("O") + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Null): @@ -164,6 +178,29 @@ def ibis_dtype_to_bigframes_dtype( ) +def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: + if isinstance(ibis_dtype, ibis_dtypes.Array): + return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type)) + + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pa.struct( + [ + (name, ibis_dtype_to_arrow_dtype(dtype)) + for name, dtype in ibis_dtype.fields.items() + ] + ) + + if ibis_dtype in IBIS_TO_ARROW: + return IBIS_TO_ARROW[ibis_dtype] + elif isinstance(ibis_dtype, ibis_dtypes.Null): + # Fallback to STRING for NULL values for most flexibility in SQL. + return IBIS_TO_ARROW[ibis_dtypes.string] + else: + raise ValueError( + f"Unexpected Ibis data type {ibis_dtype}. 
{constants.FEEDBACK_LINK}" + ) + + def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: """Converts an Ibis expression to canonical type. @@ -187,6 +224,24 @@ def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table: return table.select(*casted_columns) +def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: + if pa.types.is_struct(arrow_dtype): + struct_dtype = typing.cast(pa.StructType, arrow_dtype) + return ibis_dtypes.Struct.from_tuples( + [ + (field.name, arrow_dtype_to_ibis_dtype(field.type)) + for field in struct_dtype + ] + ) + + if arrow_dtype in ARROW_TO_IBIS: + return ARROW_TO_IBIS[arrow_dtype] + else: + raise ValueError( + f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}" + ) + + def bigframes_dtype_to_ibis_dtype( bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] ) -> ibis_dtypes.DataType: @@ -202,6 +257,9 @@ def bigframes_dtype_to_ibis_dtype( Raises: ValueError: If passed a dtype not supported by BigQuery DataFrames. """ + if isinstance(bigframes_dtype, pd.ArrowDtype): + return arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) + type_string = str(bigframes_dtype) if type_string in BIGFRAMES_STRING_TO_BIGFRAMES: bigframes_dtype = BIGFRAMES_STRING_TO_BIGFRAMES[ diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py new file mode 100644 index 0000000000..f12c97981b --- /dev/null +++ b/bigframes/operations/structs.py @@ -0,0 +1,47 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing + +import ibis.expr.types as ibis_types + +import bigframes.dataframe +import bigframes.operations +import bigframes.operations.base +import bigframes.series +import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors + + +class StructField(bigframes.operations.UnaryOp): + def __init__(self, name_or_index: str | int): + self._name_or_index = name_or_index + + def _as_ibis(self, x: ibis_types.Value): + struct_value = typing.cast(ibis_types.StructValue, x) + if isinstance(self._name_or_index, str): + name = self._name_or_index + else: + name = struct_value.names[self._name_or_index] + return struct_value[name] + + +class StructAccessor( + bigframes.operations.base.SeriesMethods, vendoracessors.StructAccessor +): + __doc__ = vendoracessors.StructAccessor.__doc__ + + def field(self, name_or_index: str | int) -> bigframes.series.Series: + return self._apply_unary_op(StructField(name_or_index)) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py new file mode 100644 index 0000000000..cabb3566ee --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -0,0 +1,63 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arrays/arrow/accessors.py +"""Accessors for arrow-backed data.""" + +from __future__ import annotations + +from bigframes import constants + + +class StructAccessor: + """ + Accessor object for structured data properties of the Series values. + """ + + def field(self, name_or_index: str | int): + """ + Extract a child field of a struct as a Series. + + Parameters + ---------- + name_or_index : str | int + Name or index of the child field to extract. + + Returns + ------- + pandas.Series + The data corresponding to the selected child field. + + See Also + -------- + Series.struct.explode : Return all child fields as a DataFrame. + + Examples + -------- + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract by field name. + + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + Extract by field index. 
+ + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: int64[pyarrow] + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 05105a8de1bd9e1510fa62def8e16849a725c8d8 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 29 Sep 2023 10:55:02 -0500 Subject: [PATCH 02/11] implement explode --- bigframes/dataframe.py | 10 +++++- bigframes/operations/base.py | 10 +++++- bigframes/operations/structs.py | 18 ++++++++-- bigframes/series.py | 5 +++ .../pandas/core/arrays/arrow/accessors.py | 36 +++++++++++++++++++ 5 files changed, 75 insertions(+), 4 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0d357e7c3d..5a3834f84f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -161,7 +161,15 @@ def __init__( columns=columns, # type:ignore dtype=dtype, # type:ignore ) - if pd_dataframe.size < MAX_INLINE_DF_SIZE: + if ( + pd_dataframe.size < MAX_INLINE_DF_SIZE + # TODO(swast): Workaround data types limitation in inline data. + and not any( + dt.pyarrow_dtype + for dt in pd_dataframe.dtypes + if isinstance(dt, pandas.ArrowDtype) + ) + ): self._block = blocks.block_from_local( pd_dataframe, session or bigframes.pandas.get_global_session() ) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index add6af57f4..51eaad18b9 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -86,7 +86,15 @@ def __init__( if pd_series.name is None: # to_frame will set default numeric column label if unnamed, but we do not support int column label, so must rename pd_dataframe = pd_dataframe.set_axis(["unnamed_col"], axis=1) - if pd_dataframe.size < MAX_INLINE_SERIES_SIZE: + if ( + pd_dataframe.size < MAX_INLINE_SERIES_SIZE + # TODO(swast): Workaround data types limitation in inline data. 
+ and not any( + dt.pyarrow_dtype + for dt in pd_dataframe.dtypes + if isinstance(dt, pd.ArrowDtype) + ) + ): self._block = blocks.block_from_local( pd_dataframe, session or bigframes.pandas.get_global_session() ) diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index f12c97981b..80d51115d0 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -35,7 +35,7 @@ def _as_ibis(self, x: ibis_types.Value): name = self._name_or_index else: name = struct_value.names[self._name_or_index] - return struct_value[name] + return struct_value[name].name(name) class StructAccessor( @@ -44,4 +44,18 @@ class StructAccessor( __doc__ = vendoracessors.StructAccessor.__doc__ def field(self, name_or_index: str | int) -> bigframes.series.Series: - return self._apply_unary_op(StructField(name_or_index)) + series = self._apply_unary_op(StructField(name_or_index)) + if isinstance(name_or_index, str): + name = name_or_index + else: + struct_field = self._dtype.pyarrow_dtype[name_or_index] + name = struct_field.name + return series.rename(name) + + def explode(self) -> bigframes.dataframe.DataFrame: + import bigframes.pandas + + pa_type = self._dtype.pyarrow_dtype + return bigframes.pandas.concat( + [self.field(i) for i in range(pa_type.num_fields)], axis="columns" + ) diff --git a/bigframes/series.py b/bigframes/series.py index c1c0cb0537..5efe7b3365 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -51,6 +51,7 @@ import bigframes.operations.base import bigframes.operations.datetimes as dt import bigframes.operations.strings as strings +import bigframes.operations.structs as structs import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series LevelType = typing.Union[str, int] @@ -118,6 +119,10 @@ def query_job(self) -> Optional[bigquery.QueryJob]: self._set_internal_query_job(self._compute_dry_run()) return self._query_job + @property + def struct(self) -> structs.StructAccessor: + return structs.StructAccessor(self._block) + def _set_internal_query_job(self, query_job: bigquery.QueryJob): self._query_job = query_job diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index cabb3566ee..7268775f25 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -61,3 +61,39 @@ def field(self, name_or_index: str | int): Name: version, dtype: int64[pyarrow] """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def explode(self): + """ + Extract all child fields of a struct as a DataFrame. + + Returns + ------- + pandas.DataFrame + The data corresponding to all child fields. + + See Also + -------- + Series.struct.field : Return a single child field as a Series. + + Examples + -------- + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... 
) + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 31290425af4b161fc0c395d133540d4592b1c2e4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 10:46:23 -0500 Subject: [PATCH 03/11] fix docstrings --- noxfile.py | 2 +- .../pandas/core/arrays/arrow/accessors.py | 129 +++++++++--------- .../bigframes_vendored/sklearn/__init__.py | 0 .../sklearn/ensemble/__init__.py | 0 .../bigframes_vendored/xgboost/__init__.py | 0 5 files changed, 63 insertions(+), 68 deletions(-) create mode 100644 third_party/bigframes_vendored/sklearn/__init__.py create mode 100644 third_party/bigframes_vendored/sklearn/ensemble/__init__.py create mode 100644 third_party/bigframes_vendored/xgboost/__init__.py diff --git a/noxfile.py b/noxfile.py index 033bbfefe4..da9dff92fe 100644 --- a/noxfile.py +++ b/noxfile.py @@ -362,7 +362,7 @@ def doctest(session: nox.sessions.Session): run_system( session=session, prefix_name="doctest", - extra_pytest_options=("--doctest-modules",), + extra_pytest_options=("--doctest-modules", "third_party"), test_folder="bigframes", check_cov=True, ) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 7268775f25..8e3ea06a3d 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -15,50 +15,45 @@ def field(self, name_or_index: str | int): """ Extract a child field of a struct as a Series. - Parameters - ---------- - name_or_index : str | int - Name or index of the child field to extract. - - Returns - ------- - pandas.Series - The data corresponding to the selected child field. - - See Also - -------- - Series.struct.explode : Return all child fields as a DataFrame. - - Examples - -------- - >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> s = bpd.Series( - ... [ - ... {"version": 1, "project": "pandas"}, - ... {"version": 2, "project": "pandas"}, - ... {"version": 1, "project": "numpy"}, - ... ], - ... dtype=bpd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) - ... ) + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) Extract by field name. - >>> s.struct.field("project") - 0 pandas - 1 pandas - 2 numpy - Name: project, dtype: string[pyarrow] + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string Extract by field index. - >>> s.struct.field(0) - 0 1 - 1 2 - 2 1 - Name: version, dtype: int64[pyarrow] + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: Int64 + + Args: + name_or_index: + Name (str) or index (int) of the child field to extract. + + Returns: + Series: + The data corresponding to the selected child field. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -66,34 +61,34 @@ def explode(self): """ Extract all child fields of a struct as a DataFrame. - Returns - ------- - pandas.DataFrame - The data corresponding to all child fields. 
- - See Also - -------- - Series.struct.field : Return a single child field as a Series. - - Examples - -------- - >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> s = bpd.Series( - ... [ - ... {"version": 1, "project": "pandas"}, - ... {"version": 2, "project": "pandas"}, - ... {"version": 1, "project": "numpy"}, - ... ], - ... dtype=bpd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) - ... ) - - >>> s.struct.explode() - version project - 0 1 pandas - 1 2 pandas - 2 1 numpy + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract all child fields. + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + + [3 rows x 2 columns] + + Returns: + DataFrame: + The data corresponding to all child fields. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/__init__.py b/third_party/bigframes_vendored/sklearn/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/sklearn/ensemble/__init__.py b/third_party/bigframes_vendored/sklearn/ensemble/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/xgboost/__init__.py b/third_party/bigframes_vendored/xgboost/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From f4671fce4134bbcd88715a6beff639f47978f478 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 10:57:52 -0500 Subject: [PATCH 04/11] add unit tests --- tests/unit/test_dtypes.py | 64 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index bb8ae570dc..3baff2e1f5 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -85,6 +85,70 @@ def test_ibis_float32_raises_unexpected_datatype(): bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_dtypes.float32) +IBIS_ARROW_DTYPES = ( + (ibis_dtypes.boolean, pa.bool_()), + (ibis_dtypes.date, pa.date32()), + (ibis_dtypes.Timestamp(), pa.timestamp("us")), + (ibis_dtypes.float64, pa.float64()), + ( + ibis_dtypes.Timestamp(timezone="UTC"), + pa.timestamp("us", tz="UTC"), + ), + ( + ibis_dtypes.Struct.from_tuples( + [ + ("name", ibis_dtypes.string()), + ("version", ibis_dtypes.int64()), + ] + ), + pa.struct( + [ + ("name", pa.string()), + ("version", pa.int64()), + ] + ), + ), + ( + ibis_dtypes.Struct.from_tuples( + [ + ( + "nested", + ibis_dtypes.Struct.from_tuples( + [ + ("field", ibis_dtypes.string()), + ] + ), + ), + ] + ), + pa.struct( + [ + ( + "nested", + pa.struct( + [ + ("field", pa.string()), + ] + ), + ), + ] + ), + ), +) + + +@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) +def test_arrow_dtype_to_ibis_dtype(ibis_dtype, arrow_dtype): + result = bigframes.dtypes.arrow_dtype_to_ibis_dtype(arrow_dtype) + assert result == ibis_dtype + + +@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) +def test_ibis_dtype_to_arrow_dtype(ibis_dtype, arrow_dtype): + result = bigframes.dtypes.ibis_dtype_to_arrow_dtype(ibis_dtype) + assert result == arrow_dtype + + 
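These parametrized cases exercise the two new helpers as inverses over the shared IBIS_ARROW_DTYPES pairs. A minimal round-trip sketch of the property under test, assuming only the public helpers added earlier in this series (`ibis_dtype_to_arrow_dtype` and `arrow_dtype_to_ibis_dtype`):

# Sketch: converting a nested STRUCT type from Arrow to Ibis and back
# should be lossless for every pair listed in IBIS_ARROW_DTYPES.
import pyarrow as pa

import bigframes.dtypes

arrow_type = pa.struct([("name", pa.string()), ("version", pa.int64())])
ibis_type = bigframes.dtypes.arrow_dtype_to_ibis_dtype(arrow_type)
assert bigframes.dtypes.ibis_dtype_to_arrow_dtype(ibis_type) == arrow_type
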
@pytest.mark.parametrize( ["bigframes_dtype", "ibis_dtype"], [ From a18370888ef720235b2b75d9bbc788ff727cf123 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 11:37:09 -0500 Subject: [PATCH 05/11] update struct dtype tests --- tests/system/small/test_dataframe.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e71b1430e6..43dfbed426 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -884,7 +884,19 @@ def test_get_dtypes_array_struct(session): dtypes = df.dtypes pd.testing.assert_series_equal( dtypes, - pd.Series({"array_column": np.dtype("O"), "struct_column": np.dtype("O")}), + pd.Series( + { + "array_column": np.dtype("O"), + "struct_column": pd.ArrowDtype( + pa.struct( + [ + ("string_field", pa.string()), + ("float_field", pa.float64()), + ] + ) + ), + } + ), ) From d600a1c7e53a686f14f3959f150be78b5edb4241 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 12:49:36 -0500 Subject: [PATCH 06/11] cleanup before doctest --- noxfile.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/noxfile.py b/noxfile.py index a113e1fcde..1ce3965d6e 100644 --- a/noxfile.py +++ b/noxfile.py @@ -275,6 +275,20 @@ def install_systemtest_dependencies(session, install_test_extra, *constraints): session.install("-e", ".", *constraints) +def clean_pycache(): + paths = CURRENT_DIRECTORY.glob("**/__pycache__/**/*") + for path in paths: + path.unlink() + + paths = CURRENT_DIRECTORY.glob("**/__pycache__") + for path in paths: + path.rmdir() + + paths = CURRENT_DIRECTORY.glob("**/*.pyc") + for path in paths: + path.unlink() + + def run_system( session: nox.sessions.Session, prefix_name, @@ -286,6 +300,7 @@ def run_system( extra_pytest_options=(), ): """Run the system test suite.""" + clean_pycache() constraints_path = str( CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" ) From 9e90b1919401dd9a3869f07a317f2797b7868423 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 12:58:49 -0500 Subject: [PATCH 07/11] alternative workaround for mismatch import error --- noxfile.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/noxfile.py b/noxfile.py index 1ce3965d6e..15c87746f4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -275,20 +275,6 @@ def install_systemtest_dependencies(session, install_test_extra, *constraints): session.install("-e", ".", *constraints) -def clean_pycache(): - paths = CURRENT_DIRECTORY.glob("**/__pycache__/**/*") - for path in paths: - path.unlink() - - paths = CURRENT_DIRECTORY.glob("**/__pycache__") - for path in paths: - path.rmdir() - - paths = CURRENT_DIRECTORY.glob("**/*.pyc") - for path in paths: - path.unlink() - - def run_system( session: nox.sessions.Session, prefix_name, @@ -300,7 +286,6 @@ def run_system( extra_pytest_options=(), ): """Run the system test suite.""" - clean_pycache() constraints_path = str( CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" ) @@ -374,6 +359,9 @@ def system_noextras(session: nox.sessions.Session): @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1]) def doctest(session: nox.sessions.Session): """Run the system test suite.""" + # Workaround https://github.com/pytest-dev/pytest/issues/9567 + os.environ["PY_IGNORE_IMPORTMISMATCH"] = "1" + run_system( session=session, prefix_name="doctest", From e55be81e42a8c8a7f92fe15241ce6929947a558e Mon Sep 17 00:00:00 2001 From: Tim Swast Date: 
Mon, 2 Oct 2023 13:00:30 -0500 Subject: [PATCH 08/11] alternative workaround for mismatch import error --- .kokoro/build.sh | 3 +++ noxfile.py | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.kokoro/build.sh b/.kokoro/build.sh index a0fa4bc787..58eaa7fedf 100755 --- a/.kokoro/build.sh +++ b/.kokoro/build.sh @@ -26,6 +26,9 @@ cd "${PROJECT_ROOT}" # Disable buffering, so that the logs stream through. export PYTHONUNBUFFERED=1 +# Workaround https://github.com/pytest-dev/pytest/issues/9567 +export PY_IGNORE_IMPORTMISMATCH=1 + # Debug: show build environment env | grep KOKORO diff --git a/noxfile.py b/noxfile.py index 15c87746f4..a113e1fcde 100644 --- a/noxfile.py +++ b/noxfile.py @@ -359,9 +359,6 @@ def system_noextras(session: nox.sessions.Session): @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1]) def doctest(session: nox.sessions.Session): """Run the system test suite.""" - # Workaround https://github.com/pytest-dev/pytest/issues/9567 - os.environ["PY_IGNORE_IMPORTMISMATCH"] = "1" - run_system( session=session, prefix_name="doctest", From 6c133143f450949295424da5c03b3f96eac7529a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 17:07:30 -0500 Subject: [PATCH 09/11] remove dead ibis null to arrow check --- bigframes/dtypes.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 85b473ebbd..46a7a1cb50 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -192,9 +192,6 @@ def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: if ibis_dtype in IBIS_TO_ARROW: return IBIS_TO_ARROW[ibis_dtype] - elif isinstance(ibis_dtype, ibis_dtypes.Null): - # Fallback to STRING for NULL values for most flexibility in SQL. - return IBIS_TO_ARROW[ibis_dtypes.string] else: raise ValueError( f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}" From e70084995f88ebc990dad2e9172047218c62baf3 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 15:54:59 -0500 Subject: [PATCH 10/11] feat: use ArrowDtype for STRUCT columns in `to_pandas` --- bigframes/core/blocks.py | 2 ++ bigframes/dtypes.py | 17 +++++++++++++++++ tests/system/small/test_dataframe_io.py | 23 ++++++++++++++++++++--- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0161d17361..262682b0b0 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -374,7 +374,9 @@ def _to_dataframe( cls, result, schema: typing.Mapping[str, bigframes.dtypes.Dtype] ) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" + dtypes = bigframes.dtypes.to_pandas_dtypes_overrides(result.schema) df = result.to_dataframe( + dtypes=dtypes, bool_dtype=pd.BooleanDtype(), int_dtype=pd.Int64Dtype(), float_dtype=pd.Float64Dtype(), diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 46a7a1cb50..d8b40c6a5a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -19,6 +19,8 @@ from typing import Any, Dict, Iterable, Literal, Tuple, Union import geopandas as gpd # type: ignore +import google.cloud.bigquery as bigquery +import google.cloud.bigquery._pandas_helpers import ibis import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types @@ -401,3 +403,18 @@ def cast_ibis_value( raise TypeError( f"Unsupported cast {value.type()} to {to_type}. 
{constants.FEEDBACK_LINK}" ) + + +def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict: + """For each STRUCT field, make sure we specify the full type to use.""" + # TODO(swast): Also override ARRAY fields. + dtypes = {} + for field in schema: + if field.field_type == "RECORD" and field.mode != "REPEATED": + # TODO(swast): We're using a private API here. Would likely be + # better if we called `to_arrow()` and converted to a pandas + # DataFrame ourselves from that. + dtypes[field.name] = pd.ArrowDtype( + google.cloud.bigquery._pandas_helpers.bq_to_arrow_data_type(field) + ) + return dtypes diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 3886b85f40..d60083a837 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -16,6 +16,7 @@ import google.api_core.exceptions import pandas as pd +import pyarrow as pa import pytest from tests.system.utils import ( @@ -44,7 +45,7 @@ def test_to_pandas_w_correct_dtypes(scalars_df_default_index): def test_to_pandas_array_struct_correct_result(session): - """In future, we should support arrays and structs with arrow types. + """In future, we should support arrays with arrow types. For now we fall back to the current connector behavior of converting to Python objects""" df = session.read_gbq( @@ -59,11 +60,27 @@ def test_to_pandas_array_struct_correct_result(session): expected = pd.DataFrame( { "array_column": [[1, 3, 2]], - "struct_column": [{"string_field": "a", "float_field": 1.2}], + "struct_column": pd.Series( + [{"string_field": "a", "float_field": 1.2}], + dtype=pd.ArrowDtype( + pa.struct( + [ + ("string_field", pa.string()), + ("float_field", pa.float64()), + ] + ) + ), + ), } ) expected.index = expected.index.astype("Int64") - pd.testing.assert_frame_equal(result, expected) + pd.testing.assert_series_equal(result.dtypes, expected.dtypes) + pd.testing.assert_series_equal(result["array_column"], expected["array_column"]) + # assert_series_equal not implemented for struct columns yet. Compare + # values as Python objects, instead. 
+ pd.testing.assert_series_equal( + result["struct_column"].astype("O"), expected["struct_column"].astype("O") + ) @pytest.mark.parametrize( From 884ab1647823d5e44fcc2fddf0855f35e18a026c Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 18 Oct 2023 16:07:21 +0000 Subject: [PATCH 11/11] pull private method into third-party --- bigframes/dtypes.py | 4 +- noxfile.py | 2 + .../google_cloud_bigquery/LICENSE | 202 +++++++++ .../google_cloud_bigquery/__init__.py | 13 + .../google_cloud_bigquery/_pandas_helpers.py | 158 +++++++ .../google_cloud_bigquery/tests/__init__.py | 13 + .../tests/unit/__init__.py | 13 + .../tests/unit/test_pandas_helpers.py | 413 ++++++++++++++++++ 8 files changed, 816 insertions(+), 2 deletions(-) create mode 100644 third_party/bigframes_vendored/google_cloud_bigquery/LICENSE create mode 100644 third_party/bigframes_vendored/google_cloud_bigquery/__init__.py create mode 100644 third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py create mode 100644 third_party/bigframes_vendored/google_cloud_bigquery/tests/__init__.py create mode 100644 third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/__init__.py create mode 100644 third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/test_pandas_helpers.py diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index d8b40c6a5a..da221a95ac 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -20,7 +20,6 @@ import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import google.cloud.bigquery._pandas_helpers import ibis import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types @@ -29,6 +28,7 @@ import pyarrow as pa import bigframes.constants as constants +import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ @@ -415,6 +415,6 @@ def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict: # better if we called `to_arrow()` and converted to a pandas # DataFrame ourselves from that. dtypes[field.name] = pd.ArrowDtype( - google.cloud.bigquery._pandas_helpers.bq_to_arrow_data_type(field) + gcb3p_pandas_helpers.bq_to_arrow_data_type(field) ) return dtypes diff --git a/noxfile.py b/noxfile.py index 54ccdb9a87..1864da9fe7 100644 --- a/noxfile.py +++ b/noxfile.py @@ -185,6 +185,7 @@ def run_unit(session, install_test_extra): # Run py.test against the unit tests. tests_path = os.path.join("tests", "unit") + third_party_tests_path = os.path.join("third_party", "bigframes_vendored") session.run( "py.test", "--quiet", @@ -196,6 +197,7 @@ def run_unit(session, install_test_extra): "--cov-report=term-missing", "--cov-fail-under=0", tests_path, + third_party_tests_path, *session.posargs, ) diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/LICENSE b/third_party/bigframes_vendored/google_cloud_bigquery/LICENSE new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/third_party/bigframes_vendored/google_cloud_bigquery/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
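Vendoring `bq_to_arrow_data_type` below removes the dependency on the private `google.cloud.bigquery._pandas_helpers` module that patch 10 imported. A rough sketch of how `to_pandas_dtypes_overrides` applies the vendored helper to a STRUCT column; the schema field here is illustrative:

import google.cloud.bigquery as bigquery
import pandas as pd

import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as helpers

# Illustrative RECORD column, mirroring struct_column from the system tests.
field = bigquery.SchemaField(
    "struct_column",
    "RECORD",
    fields=(
        bigquery.SchemaField("string_field", "STRING"),
        bigquery.SchemaField("float_field", "FLOAT"),
    ),
)
# For non-repeated RECORD fields, the override is an ArrowDtype wrapping
# the full pyarrow struct type.
dtype_override = pd.ArrowDtype(helpers.bq_to_arrow_data_type(field))
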
diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/__init__.py b/third_party/bigframes_vendored/google_cloud_bigquery/__init__.py new file mode 100644 index 0000000000..1dc90d1848 --- /dev/null +++ b/third_party/bigframes_vendored/google_cloud_bigquery/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py b/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py new file mode 100644 index 0000000000..5e2a7a7ef0 --- /dev/null +++ b/third_party/bigframes_vendored/google_cloud_bigquery/_pandas_helpers.py @@ -0,0 +1,158 @@ +# Original: https://github.com/googleapis/python-bigquery/blob/main/google/cloud/bigquery/_pandas_helpers.py +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared helper functions for connecting BigQuery and pandas.""" + +import warnings + +import google.cloud.bigquery.schema as schema +import pyarrow + + +def pyarrow_datetime(): + return pyarrow.timestamp("us", tz=None) + + +def pyarrow_numeric(): + return pyarrow.decimal128(38, 9) + + +def pyarrow_bignumeric(): + # 77th digit is partial. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + return pyarrow.decimal256(76, 38) + + +def pyarrow_time(): + return pyarrow.time64("us") + + +def pyarrow_timestamp(): + return pyarrow.timestamp("us", tz="UTC") + + +# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py +# When modifying it be sure to update it there as well. 
+BQ_TO_ARROW_SCALARS = { + "BOOL": pyarrow.bool_, + "BOOLEAN": pyarrow.bool_, + "BYTES": pyarrow.binary, + "DATE": pyarrow.date32, + "DATETIME": pyarrow_datetime, + "FLOAT": pyarrow.float64, + "FLOAT64": pyarrow.float64, + "GEOGRAPHY": pyarrow.string, + "INT64": pyarrow.int64, + "INTEGER": pyarrow.int64, + "NUMERIC": pyarrow_numeric, + "STRING": pyarrow.string, + "TIME": pyarrow_time, + "TIMESTAMP": pyarrow_timestamp, + "BIGNUMERIC": pyarrow_bignumeric, +} +ARROW_SCALAR_IDS_TO_BQ = { + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes + pyarrow.bool_().id: "BOOL", + pyarrow.int8().id: "INT64", + pyarrow.int16().id: "INT64", + pyarrow.int32().id: "INT64", + pyarrow.int64().id: "INT64", + pyarrow.uint8().id: "INT64", + pyarrow.uint16().id: "INT64", + pyarrow.uint32().id: "INT64", + pyarrow.uint64().id: "INT64", + pyarrow.float16().id: "FLOAT64", + pyarrow.float32().id: "FLOAT64", + pyarrow.float64().id: "FLOAT64", + pyarrow.time32("ms").id: "TIME", + pyarrow.time64("ns").id: "TIME", + pyarrow.timestamp("ns").id: "TIMESTAMP", + pyarrow.date32().id: "DATE", + pyarrow.date64().id: "DATETIME", # because millisecond resolution + pyarrow.binary().id: "BYTES", + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() + # The exact scale and precision don't matter. Only the type ID matters, + # and it's the same for all decimal128/decimal256 instances. + pyarrow.decimal128(38, scale=9).id: "NUMERIC", + pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", +} + + +BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { + "GEOGRAPHY": { + b"ARROW:extension:name": b"google:sqlType:geography", + b"ARROW:extension:metadata": b'{"encoding": "WKT"}', + }, + "DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"}, +} + + +def bq_to_arrow_struct_data_type(field): + arrow_fields = [] + for subfield in field.fields: + arrow_subfield = bq_to_arrow_field(subfield) + if arrow_subfield: + arrow_fields.append(arrow_subfield) + else: + # Could not determine a subfield type. Fallback to type + # inference. + return None + return pyarrow.struct(arrow_fields) + + +def bq_to_arrow_data_type(field): + """Return the Arrow data type, corresponding to a given BigQuery column. + + Returns: + None: if default Arrow type inspection should be used. + """ + if field.mode is not None and field.mode.upper() == "REPEATED": + inner_type = bq_to_arrow_data_type( + schema.SchemaField(field.name, field.field_type, fields=field.fields) + ) + if inner_type: + return pyarrow.list_(inner_type) + return None + + field_type_upper = field.field_type.upper() if field.field_type else "" + if field_type_upper in schema._STRUCT_TYPES: + return bq_to_arrow_struct_data_type(field) + + data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper) + if data_type_constructor is None: + return None + return data_type_constructor() + + +def bq_to_arrow_field(bq_field, array_type=None): + """Return the Arrow field, corresponding to a given BigQuery column. + + Returns: + None: if the Arrow type cannot be determined. 
+ """ + arrow_type = bq_to_arrow_data_type(bq_field) + if arrow_type is not None: + if array_type is not None: + arrow_type = array_type # For GEOGRAPHY, at least initially + is_nullable = bq_field.mode.upper() == "NULLABLE" + metadata = BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA.get( + bq_field.field_type.upper() if bq_field.field_type else "" + ) + return pyarrow.field( + bq_field.name, arrow_type, nullable=is_nullable, metadata=metadata + ) + + warnings.warn("Unable to determine type for field '{}'.".format(bq_field.name)) + return None diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/tests/__init__.py b/third_party/bigframes_vendored/google_cloud_bigquery/tests/__init__.py new file mode 100644 index 0000000000..1dc90d1848 --- /dev/null +++ b/third_party/bigframes_vendored/google_cloud_bigquery/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/__init__.py b/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/__init__.py new file mode 100644 index 0000000000..1dc90d1848 --- /dev/null +++ b/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/test_pandas_helpers.py b/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/test_pandas_helpers.py new file mode 100644 index 0000000000..dc4a09cc54 --- /dev/null +++ b/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/test_pandas_helpers.py @@ -0,0 +1,413 @@ +# Original: https://github.com/googleapis/python-bigquery/blob/main/tests/unit/test__pandas_helpers.py +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import warnings + +from google.cloud.bigquery import schema +import pyarrow +import pyarrow.parquet +import pyarrow.types +import pytest + + +@pytest.fixture +def module_under_test(): + from third_party.bigframes_vendored.google_cloud_bigquery import _pandas_helpers + + return _pandas_helpers + + +def is_none(value): + return value is None + + +def is_datetime(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime-type + return all_( + pyarrow.types.is_timestamp, + lambda type_: type_.unit == "us", + lambda type_: type_.tz is None, + )(type_) + + +def is_numeric(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type + return all_( + pyarrow.types.is_decimal, + lambda type_: type_.precision == 38, + lambda type_: type_.scale == 9, + )(type_) + + +def is_bignumeric(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type + return all_( + pyarrow.types.is_decimal, + lambda type_: type_.precision == 76, + lambda type_: type_.scale == 38, + )(type_) + + +def is_timestamp(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp-type + return all_( + pyarrow.types.is_timestamp, + lambda type_: type_.unit == "us", + lambda type_: type_.tz == "UTC", + )(type_) + + +def do_all(functions, value): + return all((func(value) for func in functions)) + + +def all_(*functions): + return functools.partial(do_all, functions) + + +def test_is_datetime(): + assert is_datetime(pyarrow.timestamp("us", tz=None)) + assert not is_datetime(pyarrow.timestamp("ms", tz=None)) + assert not is_datetime(pyarrow.timestamp("us", tz="UTC")) + assert not is_datetime(pyarrow.timestamp("ns", tz="UTC")) + assert not is_datetime(pyarrow.string()) + + +def test_do_all(): + assert do_all((lambda _: True, lambda _: True), None) + assert not do_all((lambda _: True, lambda _: False), None) + assert not do_all((lambda _: False,), None) + + +def test_all_(): + assert all_(lambda _: True, lambda _: True)(None) + assert not all_(lambda _: True, lambda _: False)(None) + + +@pytest.mark.parametrize( + "bq_type,bq_mode,is_correct_type", + [ + ("STRING", "NULLABLE", pyarrow.types.is_string), + ("STRING", None, pyarrow.types.is_string), + ("string", "NULLABLE", pyarrow.types.is_string), + ("StRiNg", "NULLABLE", pyarrow.types.is_string), + ("BYTES", "NULLABLE", pyarrow.types.is_binary), + ("INTEGER", "NULLABLE", pyarrow.types.is_int64), + ("INT64", "NULLABLE", pyarrow.types.is_int64), + ("FLOAT", "NULLABLE", pyarrow.types.is_float64), + ("FLOAT64", "NULLABLE", pyarrow.types.is_float64), + ("NUMERIC", "NULLABLE", is_numeric), + pytest.param( + "BIGNUMERIC", + "NULLABLE", + is_bignumeric, + ), + ("BOOLEAN", "NULLABLE", pyarrow.types.is_boolean), + ("BOOL", "NULLABLE", pyarrow.types.is_boolean), + ("TIMESTAMP", "NULLABLE", is_timestamp), + ("DATE", "NULLABLE", pyarrow.types.is_date32), + ("TIME", "NULLABLE", pyarrow.types.is_time64), + ("DATETIME", "NULLABLE", is_datetime), + ("GEOGRAPHY", "NULLABLE", pyarrow.types.is_string), + ("UNKNOWN_TYPE", "NULLABLE", is_none), + # Use pyarrow.list_(item_type) for repeated (array) fields. 
+ ( + "STRING", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_string(type_.value_type), + ), + ), + ( + "STRING", + "repeated", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_string(type_.value_type), + ), + ), + ( + "STRING", + "RePeAtEd", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_string(type_.value_type), + ), + ), + ( + "BYTES", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_binary(type_.value_type), + ), + ), + ( + "INTEGER", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_int64(type_.value_type), + ), + ), + ( + "INT64", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_int64(type_.value_type), + ), + ), + ( + "FLOAT", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_float64(type_.value_type), + ), + ), + ( + "FLOAT64", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_float64(type_.value_type), + ), + ), + ( + "NUMERIC", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_numeric(type_.value_type)), + ), + pytest.param( + "BIGNUMERIC", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), + ), + ( + "BOOLEAN", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_boolean(type_.value_type), + ), + ), + ( + "BOOL", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_boolean(type_.value_type), + ), + ), + ( + "TIMESTAMP", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_timestamp(type_.value_type)), + ), + ( + "DATE", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_date32(type_.value_type), + ), + ), + ( + "TIME", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_time64(type_.value_type), + ), + ), + ( + "DATETIME", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_datetime(type_.value_type)), + ), + ( + "GEOGRAPHY", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_string(type_.value_type), + ), + ), + ("RECORD", "REPEATED", is_none), + ("UNKNOWN_TYPE", "REPEATED", is_none), + ], +) +def test_bq_to_arrow_data_type(module_under_test, bq_type, bq_mode, is_correct_type): + field = schema.SchemaField("ignored_name", bq_type, mode=bq_mode) + actual = module_under_test.bq_to_arrow_data_type(field) + assert is_correct_type(actual) + + +@pytest.mark.parametrize("bq_type", ["RECORD", "record", "STRUCT", "struct"]) +def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): + fields = ( + schema.SchemaField("field01", "STRING"), + schema.SchemaField("field02", "BYTES"), + schema.SchemaField("field03", "INTEGER"), + schema.SchemaField("field04", "INT64"), + schema.SchemaField("field05", "FLOAT"), + schema.SchemaField("field06", "FLOAT64"), + schema.SchemaField("field07", "NUMERIC"), + schema.SchemaField("field08", "BIGNUMERIC"), + schema.SchemaField("field09", "BOOLEAN"), + schema.SchemaField("field10", "BOOL"), + schema.SchemaField("field11", "TIMESTAMP"), + schema.SchemaField("field12", "DATE"), + schema.SchemaField("field13", "TIME"), + schema.SchemaField("field14", "DATETIME"), + schema.SchemaField("field15", "GEOGRAPHY"), + ) + + field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields) + actual = module_under_test.bq_to_arrow_data_type(field) + + expected = ( + pyarrow.field("field01", pyarrow.string()), + 
pyarrow.field("field02", pyarrow.binary()), + pyarrow.field("field03", pyarrow.int64()), + pyarrow.field("field04", pyarrow.int64()), + pyarrow.field("field05", pyarrow.float64()), + pyarrow.field("field06", pyarrow.float64()), + pyarrow.field("field07", module_under_test.pyarrow_numeric()), + pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field09", pyarrow.bool_()), + pyarrow.field("field10", pyarrow.bool_()), + pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field12", pyarrow.date32()), + pyarrow.field("field13", module_under_test.pyarrow_time()), + pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field15", pyarrow.string()), + ) + expected = pyarrow.struct(expected) + + assert pyarrow.types.is_struct(actual) + assert actual.num_fields == len(fields) + assert actual.equals(expected) + + +@pytest.mark.parametrize("bq_type", ["RECORD", "record", "STRUCT", "struct"]) +def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): + fields = ( + schema.SchemaField("field01", "STRING"), + schema.SchemaField("field02", "BYTES"), + schema.SchemaField("field03", "INTEGER"), + schema.SchemaField("field04", "INT64"), + schema.SchemaField("field05", "FLOAT"), + schema.SchemaField("field06", "FLOAT64"), + schema.SchemaField("field07", "NUMERIC"), + schema.SchemaField("field08", "BIGNUMERIC"), + schema.SchemaField("field09", "BOOLEAN"), + schema.SchemaField("field10", "BOOL"), + schema.SchemaField("field11", "TIMESTAMP"), + schema.SchemaField("field12", "DATE"), + schema.SchemaField("field13", "TIME"), + schema.SchemaField("field14", "DATETIME"), + schema.SchemaField("field15", "GEOGRAPHY"), + ) + + field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields) + actual = module_under_test.bq_to_arrow_data_type(field) + + expected = ( + pyarrow.field("field01", pyarrow.string()), + pyarrow.field("field02", pyarrow.binary()), + pyarrow.field("field03", pyarrow.int64()), + pyarrow.field("field04", pyarrow.int64()), + pyarrow.field("field05", pyarrow.float64()), + pyarrow.field("field06", pyarrow.float64()), + pyarrow.field("field07", module_under_test.pyarrow_numeric()), + pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field09", pyarrow.bool_()), + pyarrow.field("field10", pyarrow.bool_()), + pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field12", pyarrow.date32()), + pyarrow.field("field13", module_under_test.pyarrow_time()), + pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field15", pyarrow.string()), + ) + expected_value_type = pyarrow.struct(expected) + + assert pyarrow.types.is_list(actual) + assert pyarrow.types.is_struct(actual.value_type) + assert actual.value_type.num_fields == len(fields) + assert actual.value_type.equals(expected_value_type) + + +def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): + fields = ( + schema.SchemaField("field1", "STRING"), + schema.SchemaField("field2", "INTEGER"), + # Don't know what to convert UNKNOWN_TYPE to, let type inference work, + # instead. 
+ schema.SchemaField("field3", "UNKNOWN_TYPE"), + ) + field = schema.SchemaField("ignored_name", "RECORD", mode="NULLABLE", fields=fields) + + with warnings.catch_warnings(record=True) as warned: + actual = module_under_test.bq_to_arrow_data_type(field) + + assert actual is None + assert len(warned) == 1 + warning = warned[0] + assert "field3" in str(warning) + + +def test_bq_to_arrow_field_type_override(module_under_test): + # When loading pandas data, we may need to override the type + # decision based on data contents, because GEOGRAPHY data can be + # stored as either text or binary. + + assert ( + module_under_test.bq_to_arrow_field(schema.SchemaField("g", "GEOGRAPHY")).type + == pyarrow.string() + ) + + assert ( + module_under_test.bq_to_arrow_field( + schema.SchemaField("g", "GEOGRAPHY"), + pyarrow.binary(), + ).type + == pyarrow.binary() + ) + + +@pytest.mark.parametrize( + "field_type, metadata", + [ + ("datetime", {b"ARROW:extension:name": b"google:sqlType:datetime"}), + ( + "geography", + { + b"ARROW:extension:name": b"google:sqlType:geography", + b"ARROW:extension:metadata": b'{"encoding": "WKT"}', + }, + ), + ], +) +def test_bq_to_arrow_field_metadata(module_under_test, field_type, metadata): + assert ( + module_under_test.bq_to_arrow_field( + schema.SchemaField("g", field_type) + ).metadata + == metadata + )
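End to end, the series gives STRUCT columns a proper `ArrowDtype` and a `Series.struct` accessor. A usage sketch consistent with the docstring examples above:

import bigframes.pandas as bpd
import pyarrow as pa

bpd.options.display.progress_bar = None
s = bpd.Series(
    [
        {"version": 1, "project": "pandas"},
        {"version": 2, "project": "pandas"},
        {"version": 1, "project": "numpy"},
    ],
    dtype=bpd.ArrowDtype(
        pa.struct([("version", pa.int64()), ("project", pa.string())])
    ),
)

projects = s.struct.field("project")  # one child field, renamed to "project"
versions = s.struct.field(0)          # fields may also be selected by index
exploded = s.struct.explode()         # all child fields as a DataFrame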