From 8929fef71ec7ab1b2794ad51576c2dc4e0116c15 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 22 Sep 2023 16:04:59 +0000 Subject: [PATCH 01/29] deps: migrate to `ibis-framework >= "7.0.0"` This should unlock some bug fixes as well as potential `UNNEST` support in a future change. --- bigframes/remote_function.py | 9 +++------ noxfile.py | 6 +++--- setup.py | 3 +-- .../ibis/expr/operations/analytic.py | 12 +++++++----- .../ibis/expr/operations/reductions.py | 8 ++++---- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 6fc2f8e59f..fbfbc45e51 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -485,17 +485,14 @@ def remote_function_node( """Creates an Ibis node representing a remote function call.""" fields = { - name: rlz.value(type_) if type_ else rlz.any + name: rlz.ValueOf(None if type_ == "ANY TYPE" else type_) for name, type_ in zip( ibis_signature.parameter_names, ibis_signature.input_types ) } - try: - fields["output_type"] = rlz.shape_like("args", dtype=ibis_signature.output_type) # type: ignore - except TypeError: - fields["output_dtype"] = property(lambda _: ibis_signature.output_type) - fields["output_shape"] = rlz.shape_like("args") + fields["dtype"] = ibis_signature.output_type + fields["shape"] = rlz.shape_like("args") node = type(routine_ref_to_string_for_query(routine_ref), (ops.ValueOp,), fields) # type: ignore diff --git a/noxfile.py b/noxfile.py index 033bbfefe4..5ceaa5d06c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -84,9 +84,9 @@ "format", "docs", "docfx", - "unit", - "unit_noextras", - "system", + "unit_prerelease", + # "unit_noextras", + "system_prerelease", "doctest", "cover", "release_dry_run", diff --git a/setup.py b/setup.py index 29eacb74a9..063cdb480d 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - # TODO: Relax upper bound once we have fixed `system_prerelease` tests. 
- "ibis-framework[bigquery] >=6.2.0,<7.0.0dev", + "ibis-framework[bigquery] >=7.0.0dev,<8.0.0dev", "pandas >=1.5.0", "pydata-google-auth >=1.8.2", "requests >=2.27.1", diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 038987cac9..9da6fe1115 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -2,21 +2,23 @@ from __future__ import annotations -from ibis.expr.operations.analytic import Analytic +import ibis.expr.datatypes as dt +import ibis.expr.operations.analytic as ibis_ops_analytic +import ibis.expr.operations.core as ibis_ops_core import ibis.expr.rules as rlz -class FirstNonNullValue(Analytic): +class FirstNonNullValue(ibis_ops_analytic.Analytic): """Retrieve the first element.""" - arg = rlz.column(rlz.any) + arg: ibis_ops_core.Column[dt.Any] output_dtype = rlz.dtype_like("arg") -class LastNonNullValue(Analytic): +class LastNonNullValue(ibis_ops_analytic.Analytic): """Retrieve the last element.""" - arg = rlz.column(rlz.any) + arg: ibis_ops_core.Column[dt.Any] output_dtype = rlz.dtype_like("arg") diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index 5e6ad9ecf2..cdbb47b787 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -3,8 +3,8 @@ from __future__ import annotations import ibis.expr.datatypes as dt +import ibis.expr.operations.core as ibis_ops_core from ibis.expr.operations.reductions import Filterable, Reduction -import ibis.expr.rules as rlz class ApproximateMultiQuantile(Filterable, Reduction): @@ -13,9 +13,9 @@ class ApproximateMultiQuantile(Filterable, Reduction): See: https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_quantiles """ - arg = rlz.any - num_bins = rlz.value(dt.int64) - output_dtype = dt.Array(dt.float64) + arg: ibis_ops_core.Value + num_bins: ibis_ops_core.Value[dt.Int64] + output_dtype: ibis_ops_core.Value[dt.Array[dt.Float64]] __all__ = [ From a4e995d2d977712f6c22d556c632b1bdaeb4ceeb Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 22 Sep 2023 16:22:09 +0000 Subject: [PATCH 02/29] use dtype instead of output_dtype in custom ops --- .../bigframes_vendored/ibis/expr/operations/analytic.py | 4 ++-- .../bigframes_vendored/ibis/expr/operations/reductions.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 9da6fe1115..78aeb67a37 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -12,14 +12,14 @@ class FirstNonNullValue(ibis_ops_analytic.Analytic): """Retrieve the first element.""" arg: ibis_ops_core.Column[dt.Any] - output_dtype = rlz.dtype_like("arg") + dtype = rlz.dtype_like("arg") class LastNonNullValue(ibis_ops_analytic.Analytic): """Retrieve the last element.""" arg: ibis_ops_core.Column[dt.Any] - output_dtype = rlz.dtype_like("arg") + dtype = rlz.dtype_like("arg") __all__ = [ diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index cdbb47b787..76aa34466a 100644 --- 
a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -15,7 +15,7 @@ class ApproximateMultiQuantile(Filterable, Reduction): arg: ibis_ops_core.Value num_bins: ibis_ops_core.Value[dt.Int64] - output_dtype: ibis_ops_core.Value[dt.Array[dt.Float64]] + dtype: ibis_ops_core.Value[dt.Array[dt.Float64]] __all__ = [ From 8fd8fd4ed9d2b1c4e50a8e99b8047aab990cca56 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 22 Sep 2023 16:45:12 +0000 Subject: [PATCH 03/29] adjust type annotations --- bigframes/core/__init__.py | 3 +-- bigframes/operations/__init__.py | 13 ++++++++++--- bigframes/operations/aggregations.py | 4 ++-- bigframes/remote_function.py | 2 +- tests/system/small/test_ibis.py | 5 +++-- .../ibis/expr/operations/reductions.py | 2 +- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 3b3754642e..d84ebd0b83 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -691,8 +691,7 @@ def project_window_op( case_statement = ibis.case() for clause in clauses: case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() - window_op = case_statement + window_op = case_statement.else_(window_op).end() result = self._set_or_replace_by_id(output_name or column_name, window_op) # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index bc08298eb7..bc1b44eac3 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -18,6 +18,7 @@ import typing import ibis +import ibis.common.annotations import ibis.common.exceptions import ibis.expr.datatypes as ibis_dtypes import ibis.expr.operations.generic @@ -737,9 +738,15 @@ def add_op( ): if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): return - return typing.cast(ibis_types.NumericValue, x) + typing.cast( - ibis_types.NumericValue, y - ) + try: + # Could be string concatenation or numeric addition. + return x + y # type: ignore + except ibis.common.annotations.SignatureValidationError as exc: + left_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(x.type()) + right_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(y.type()) + raise TypeError( + f"Cannot add {repr(left_type)} and {repr(right_type)}. {constants.FEEDBACK_LINK}" + ) from exc @short_circuit_nulls() diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 23271e8220..dd458c9e26 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -74,7 +74,7 @@ def _as_ibis( # Will be null if all inputs are null. Pandas defaults to zero sum though. 
bq_sum = _apply_window_if_present(column.sum(), window) return ( - ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() + ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() # type: ignore ) @@ -167,7 +167,7 @@ def _as_ibis( .else_(magnitude * pow(-1, negative_count_parity)) .end() ) - return float_result.cast(column.type()) + return float_result.cast(column.type()) # type: ignore class MaxOp(AggregateOp): diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index fbfbc45e51..5c3f07db6b 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -491,7 +491,7 @@ def remote_function_node( ) } - fields["dtype"] = ibis_signature.output_type + fields["dtype"] = ibis_signature.output_type # type: ignore fields["shape"] = rlz.shape_like("args") node = type(routine_ref_to_string_for_query(routine_ref), (ops.ValueOp,), fields) # type: ignore diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py index 58b78e0048..a8927d2f2b 100644 --- a/tests/system/small/test_ibis.py +++ b/tests/system/small/test_ibis.py @@ -26,8 +26,9 @@ def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str _, dataset, table_id = scalars_table_id.split(".") ibis_table: ibis_types.Table = ibis_client.table(table_id, database=dataset) ibis_column: ibis_types.NumericColumn = ibis_table["int64_col"] - quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( # type: ignore - ibis_column, num_bins=num_bins + quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( + ibis_column, # type: ignore + num_bins=num_bins, # type: ignore ).to_expr() value = quantiles[1] num_edges = quantiles.length() diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index 76aa34466a..e6644f477a 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -15,7 +15,7 @@ class ApproximateMultiQuantile(Filterable, Reduction): arg: ibis_ops_core.Value num_bins: ibis_ops_core.Value[dt.Int64] - dtype: ibis_ops_core.Value[dt.Array[dt.Float64]] + dtype = dt.Array(dt.float64) __all__ = [ From 14fb7c8b80b712fda7d88bafda6a627283a130a6 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 13 Nov 2023 10:43:43 -0600 Subject: [PATCH 04/29] Update noxfile.py --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 14b00ed8a7..58e9a0bf97 100644 --- a/noxfile.py +++ b/noxfile.py @@ -85,7 +85,7 @@ "docs", "docfx", "unit_prerelease", - # "unit_noextras", + "unit_noextras", "system_prerelease", "doctest", "cover", From b1824539e33ce749a66fae49a8848e590fe74063 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 13 Nov 2023 22:30:33 +0000 Subject: [PATCH 05/29] update type annotations --- bigframes/core/compile/compiled.py | 2 +- bigframes/operations/__init__.py | 25 ++++++++++++++++++------- bigframes/operations/aggregations.py | 4 ++-- setup.py | 2 +- testing/constraints-3.9.txt | 4 ++-- tests/unit/resources.py | 2 +- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 4ba5e6bd08..1e74e13cdd 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -862,7 +862,7 @@ def project_window_op( case_statement = ibis.case() for clause in clauses: 
case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() + case_statement = case_statement.else_(window_op).end() # type: ignore window_op = case_statement result = self._set_or_replace_by_id(output_name or column_name, window_op) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 8c42b6c56a..ec3816ad11 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -353,14 +353,23 @@ def _as_ibis(self, x: ibis_types.Value): str_val = typing.cast(ibis_types.StringValue, x) # SQL pad operations will truncate, we do not want to truncate though. - pad_length = ibis.greatest(str_val.length(), self._length) + pad_length = typing.cast( + ibis_types.IntegerValue, ibis.greatest(str_val.length(), self._length) + ) if self._side == "left": return str_val.lpad(pad_length, self._fillchar) elif self._side == "right": return str_val.rpad(pad_length, self._fillchar) else: # side == both # Pad more on right side if can't pad both sides equally - lpad_amount = ((pad_length - str_val.length()) // 2) + str_val.length() + lpad_amount = typing.cast( + ibis_types.IntegerValue, + ( + (pad_length - str_val.length()) + // typing.cast(ibis_types.NumericValue, ibis.literal(2)) + ) + + str_val.length(), + ) return str_val.lpad(lpad_amount, self._fillchar).rpad( pad_length, self._fillchar ) @@ -1054,7 +1063,7 @@ def where_op( replacement: ibis_types.Value, ) -> ibis_types.Value: """Returns x if y is true, otherwise returns z.""" - return ibis.case().when(condition, original).else_(replacement).end() + return ibis.case().when(condition, original).else_(replacement).end() # type: ignore def clip_op( @@ -1067,7 +1076,7 @@ def clip_op( not isinstance(upper, ibis_types.NullScalar) ): return ( - ibis.case() + ibis.case() # type: ignore .when(upper.isnull() | (original > upper), upper) .else_(original) .end() @@ -1076,7 +1085,7 @@ def clip_op( upper, ibis_types.NullScalar ): return ( - ibis.case() + ibis.case() # type: ignore .when(lower.isnull() | (original < lower), lower) .else_(original) .end() @@ -1086,9 +1095,11 @@ def clip_op( ): return original else: - # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. This implementation requires that lower_bound < upper_bound + # Note: Pandas has unchanged behavior when upper bound and lower bound + # are flipped. + # This implementation requires that lower_bound < upper_bound. return ( - ibis.case() + ibis.case() # type: ignore .when(lower.isnull() | (original < lower), lower) .when(upper.isnull() | (original > upper), upper) .else_(original) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index f69d41afe6..363dfe819d 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -290,7 +290,7 @@ def _as_ibis( dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=Int64Dtype()), ) out = out.else_(None) - return out.end() + return out.end() # type: ignore @property def skips_nulls(self): @@ -482,7 +482,7 @@ def _map_to_literal( original: ibis_types.Value, literal: ibis_types.Scalar ) -> ibis_types.Column: # Hack required to perform aggregations on literals in ibis, even though bigquery will let you directly aggregate literals (eg. 
'SELECT COUNT(1) from table1') - return ibis.ifelse(original.isnull(), literal, literal) + return ibis.ifelse(original.isnull(), literal, literal) # type: ignore sum_op = SumOp() diff --git a/setup.py b/setup.py index 063cdb480d..1ad795466e 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - "ibis-framework[bigquery] >=7.0.0dev,<8.0.0dev", + "ibis-framework[bigquery] >=7.0.0,<8.0.0dev", "pandas >=1.5.0", "pydata-google-auth >=1.8.2", "requests >=2.27.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index f43d3b4ca0..63e3b43cc4 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -45,7 +45,7 @@ greenlet==2.0.2 grpc-google-iam-v1==0.12.6 grpcio==1.53.0 grpcio-status==1.48.2 -ibis-framework==6.2.0 +ibis-framework==7.0.0 humanize==4.6.0 identify==2.5.22 idna==3.4 @@ -107,7 +107,7 @@ scikit-learn==1.2.2 SecretStorage==3.3.3 six==1.16.0 SQLAlchemy==1.4.0 -sqlglot==10.6.4 +sqlglot==18.7.0 tomli==2.0.1 toolz==0.12.0 tqdm==4.65.0 diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 8fc8acd175..6f74d1c8d9 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -80,7 +80,7 @@ def create_dataframe( # might not actually be used. Mock out the global session, too. monkeypatch.setattr(bigframes.core.global_session, "_global_session", session) bigframes.options.bigquery._session_started = True - return bigframes.dataframe.DataFrame({}, session=session) + return bigframes.dataframe.DataFrame({"col": []}, session=session) def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Session: From 0ab94cc33c1ea7b4f80f51ba018bf0368605b64f Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 20:19:01 +0000 Subject: [PATCH 06/29] fix for deferred values --- bigframes/core/compile/compiled.py | 19 +++++++++++++++++-- bigframes/session/__init__.py | 6 ++++-- tests/system/small/test_ibis.py | 8 ++++++-- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 07bca1eca4..9593725396 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -21,6 +21,7 @@ import ibis import ibis.backends.bigquery as ibis_bigquery +import ibis.common.deferred import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas @@ -62,7 +63,14 @@ def __init__( self._columns = tuple(columns) # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} + self._column_names = { + ( + column.resolve(table) + if isinstance(column, ibis.common.deferred.Deferred) + else column + ).get_name(): column + for column in self._columns + } @property def columns(self) -> typing.Tuple[ibis_types.Value, ...]: @@ -643,7 +651,14 @@ def __init__( # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. 
- self._column_names = {column.get_name(): column for column in self._columns} + self._column_names = { + ( + column.resolve(table) + if isinstance(column, ibis.common.deferred.Deferred) + else column + ).get_name(): column + for column in self._columns + } self._hidden_ordering_column_names = { column.get_name(): column for column in self._hidden_ordering_columns } diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 928123ce74..9211c4fa42 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -472,7 +472,8 @@ def _get_snapshot_sql_and_primary_key( ) table_expression = self.ibis_client.table( table_ref.table_id, - database=f"{table_ref.project}.{table_ref.dataset_id}", + schema=table_ref.dataset_id, + database=table_ref.project, ) # If there are primary keys defined, the query engine assumes these @@ -803,7 +804,8 @@ def _read_pandas( ) table_expression = self.ibis_client.table( load_table_destination.table_id, - database=f"{load_table_destination.project}.{load_table_destination.dataset_id}", + schema=load_table_destination.dataset_id, + database=load_table_destination.project, ) # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression. diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py index a8927d2f2b..86be81dc61 100644 --- a/tests/system/small/test_ibis.py +++ b/tests/system/small/test_ibis.py @@ -23,8 +23,12 @@ def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str): num_bins = 3 ibis_client = session.ibis_client - _, dataset, table_id = scalars_table_id.split(".") - ibis_table: ibis_types.Table = ibis_client.table(table_id, database=dataset) + project, dataset, table_id = scalars_table_id.split(".") + ibis_table: ibis_types.Table = ibis_client.table( + table_id, + schema=dataset, + database=project, + ) ibis_column: ibis_types.NumericColumn = ibis_table["int64_col"] quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( ibis_column, # type: ignore From 4931923acd7596a5aa926a325ae041f8f3966a74 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 20:27:39 +0000 Subject: [PATCH 07/29] fix prerelease --- noxfile.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/noxfile.py b/noxfile.py index ec8642b5df..829d4d95ea 100644 --- a/noxfile.py +++ b/noxfile.py @@ -521,23 +521,11 @@ def prerelease(session: nox.sessions.Session, tests_path): ) already_installed.add("pandas") - # TODO(shobs): - # Commit https://github.com/ibis-project/ibis/commit/c20ba7feab6bdea6c299721310e04dbc10551cc2 - # introduced breaking change that removed the following: - # ibis.expr.rules.column - # ibis.expr.rules.value - # ibis.expr.rules.any - # Let's exclude ibis head from prerelease install list for now. Instead, use - # a working ibis-framework version resolved via setup.by (currently resolves - # to version 6.2.0 due to version requirement "6.2.0,<7.0.0dev"). - # We should enable the head back once bigframes support a version that - # includes the above commit. - # session.install( - # "--upgrade", - # "-e", # Use -e so that py.typed file is included. - # "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", - # ) - session.install("--no-deps", "ibis-framework==6.2.0") + session.install( + "--upgrade", + "-e", # Use -e so that py.typed file is included. 
+ "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", + ) already_installed.add("ibis-framework") # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 From 9df0816110521107a570c61a7ffcc930ae79e2a9 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 20:42:53 +0000 Subject: [PATCH 08/29] minimum 7.1.0 --- bigframes/core/compile/compiled.py | 6 +++++- bigframes/session/__init__.py | 4 ++-- setup.py | 2 +- testing/constraints-3.9.txt | 2 +- tests/system/small/test_ibis.py | 2 +- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 9593725396..537d9c8b52 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -21,7 +21,7 @@ import ibis import ibis.backends.bigquery as ibis_bigquery -import ibis.common.deferred +import ibis.common.deferred # type: ignore import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas @@ -66,6 +66,8 @@ def __init__( self._column_names = { ( column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. if isinstance(column, ibis.common.deferred.Deferred) else column ).get_name(): column @@ -654,6 +656,8 @@ def __init__( self._column_names = { ( column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. if isinstance(column, ibis.common.deferred.Deferred) else column ).get_name(): column diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9211c4fa42..6045caa899 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -470,7 +470,7 @@ def _get_snapshot_sql_and_primary_key( ), None, ) - table_expression = self.ibis_client.table( + table_expression = self.ibis_client.table( # type: ignore table_ref.table_id, schema=table_ref.dataset_id, database=table_ref.project, @@ -802,7 +802,7 @@ def _read_pandas( total_ordering_columns=frozenset([ordering_col]), integer_encoding=IntegerEncoding(True, is_sequential=True), ) - table_expression = self.ibis_client.table( + table_expression = self.ibis_client.table( # type: ignore load_table_destination.table_id, schema=load_table_destination.dataset_id, database=load_table_destination.project, diff --git a/setup.py b/setup.py index 1ad795466e..153ce79782 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - "ibis-framework[bigquery] >=7.0.0,<8.0.0dev", + "ibis-framework[bigquery] >=7.1.0,<8.0.0dev", "pandas >=1.5.0", "pydata-google-auth >=1.8.2", "requests >=2.27.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 63e3b43cc4..011aafdee9 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -45,7 +45,7 @@ greenlet==2.0.2 grpc-google-iam-v1==0.12.6 grpcio==1.53.0 grpcio-status==1.48.2 -ibis-framework==7.0.0 +ibis-framework==7.1.0 humanize==4.6.0 identify==2.5.22 idna==3.4 diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py index 86be81dc61..9fe1176068 100644 --- a/tests/system/small/test_ibis.py +++ b/tests/system/small/test_ibis.py @@ -24,7 +24,7 @@ def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str num_bins = 3 ibis_client = session.ibis_client project, dataset, table_id = scalars_table_id.split(".") - ibis_table: ibis_types.Table = 
ibis_client.table( + ibis_table: ibis_types.Table = ibis_client.table( # type: ignore table_id, schema=dataset, database=project, From 71f48891ff04792037bc8bfdd21eb79c856109ba Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 20:48:06 +0000 Subject: [PATCH 09/29] mypy --- mypy.ini | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mypy.ini b/mypy.ini index 901394813a..3809f8e241 100644 --- a/mypy.ini +++ b/mypy.ini @@ -24,5 +24,8 @@ ignore_missing_imports = True [mypy-pyarrow] ignore_missing_imports = True +[mypy-ibis.*] +ignore_missing_imports = True + [mypy-ipywidgets] ignore_missing_imports = True From 6c64ec59169d7b4e70850827ba0bc5cdd920e7ed Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 23:33:58 +0000 Subject: [PATCH 10/29] revert presubmit changes --- bigframes/session/__init__.py | 8 ++++---- noxfile.py | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 6045caa899..a9e65dc171 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -472,8 +472,8 @@ def _get_snapshot_sql_and_primary_key( ) table_expression = self.ibis_client.table( # type: ignore table_ref.table_id, - schema=table_ref.dataset_id, - database=table_ref.project, + # TODO: use "dataset_id" as the "schema" + database=f"{table_ref.project}.{table_ref.dataset_id}", ) # If there are primary keys defined, the query engine assumes these @@ -804,8 +804,8 @@ def _read_pandas( ) table_expression = self.ibis_client.table( # type: ignore load_table_destination.table_id, - schema=load_table_destination.dataset_id, - database=load_table_destination.project, + # TODO: use "dataset_id" as the "schema" + database=f"{load_table_destination.project}.{load_table_destination.dataset_id}", ) # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression. diff --git a/noxfile.py b/noxfile.py index 829d4d95ea..0fe059580d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -84,9 +84,9 @@ "format", "docs", "docfx", - "unit_prerelease", + "unit", "unit_noextras", - "system_prerelease", + "system", "doctest", "cover", ] @@ -521,10 +521,18 @@ def prerelease(session: nox.sessions.Session, tests_path): ) already_installed.add("pandas") + # Ibis has introduced breaking changes. Let's exclude ibis head + # from prerelease install list for now. We should enable the head back + # once bigframes supports the version at HEAD. + # session.install( + # "--upgrade", + # "-e", # Use -e so that py.typed file is included. + # "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", + # ) session.install( "--upgrade", - "-e", # Use -e so that py.typed file is included. 
- "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", + "--pre", + "ibis-framework", ) already_installed.add("ibis-framework") From e37571c3771c1f0c2a47ea40f3206d224a9a08c2 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2023 23:39:05 +0000 Subject: [PATCH 11/29] fix minimum sqlglot --- testing/constraints-3.9.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 011aafdee9..218255c77e 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -107,7 +107,7 @@ scikit-learn==1.2.2 SecretStorage==3.3.3 six==1.16.0 SQLAlchemy==1.4.0 -sqlglot==18.7.0 +sqlglot==18.12.0 tomli==2.0.1 toolz==0.12.0 tqdm==4.65.0 From fad36a8cd48275b8701102309754bf0685707437 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 28 Nov 2023 18:41:06 +0000 Subject: [PATCH 12/29] fix custom op --- third_party/bigframes_vendored/ibis/expr/operations/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py index dbb3fa3066..772c2e8ff4 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/json.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -6,4 +6,4 @@ class ToJsonString(Unary): - output_dtype = dt.string + dtype = dt.string From c318b186647290c71fb35bf5f4234f530f6b2780 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 14 Dec 2023 23:23:11 +0000 Subject: [PATCH 13/29] hack InMemoryTable formatter back in --- bigframes/session/__init__.py | 4 +- .../ibis/backends/bigquery/__init__.py | 3 + .../ibis/backends/bigquery/compiler.py | 59 +++++++++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index a794e709ff..fb5fab86ce 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -79,9 +79,9 @@ import bigframes.session.clients import bigframes.version -# Even though the ibis.backends.bigquery.registry import is unused, it's needed +# Even though the ibis.backends.bigquery import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. -import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa +import third_party.bigframes_vendored.ibis.backends.bigquery # noqa import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py index e69de29bb2..43508fab11 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py @@ -0,0 +1,3 @@ +# Import all sub-modules to monkeypatch everything. 
+import third_party.bigframes_vendored.ibis.backends.bigquery.compiler # noqa +import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py b/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py new file mode 100644 index 0000000000..414f0a7c81 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py @@ -0,0 +1,59 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/compiler.py +"""Module to convert from Ibis expression to SQL string.""" + +from __future__ import annotations + +import re + +from ibis.backends.base.sql import compiler as sql_compiler +import ibis.backends.bigquery.compiler +from ibis.backends.bigquery.datatypes import BigQueryType +import ibis.expr.datatypes as dt +import ibis.expr.operations as ops + +_NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+') +_EXACT_NAME_REGEX = re.compile(f"^{_NAME_REGEX.pattern}$") + + +class BigQueryTableSetFormatter(sql_compiler.TableSetFormatter): + def _quote_identifier(self, name): + """Restore 6.x version of identifier quoting. + + 7.x uses sqlglot which as of December 2023 doesn't know about the + extended unicode names for BigQuery yet. + """ + if _EXACT_NAME_REGEX.match(name) is not None: + return name + return f"`{name}`" + + def _format_in_memory_table(self, op): + """Restore 6.x version of InMemoryTable. + + BigQuery DataFrames explicitly uses InMemoryTable only when we know + the data is small enough to embed in SQL. + """ + schema = op.schema + names = schema.names + types = schema.types + + raw_rows = [] + for row in op.data.to_frame().itertuples(index=False): + raw_row = ", ".join( + f"{self._translate(lit)} AS {name}" + for lit, name in zip( + map(ops.Literal, row, types), map(self._quote_identifier, names) + ) + ) + raw_rows.append(f"STRUCT({raw_row})") + array_type = BigQueryType.from_ibis(dt.Array(op.schema.as_struct())) + + return f"UNNEST({array_type}[{', '.join(raw_rows)}])" + + +# Override implementation. 
+ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._quote_identifier = ( + BigQueryTableSetFormatter._quote_identifier +) +ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._format_in_memory_table = ( + BigQueryTableSetFormatter._format_in_memory_table +) From d3304b2897cd88097cb115287e05a30e88f490b8 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 16:08:05 +0000 Subject: [PATCH 14/29] use ops module to avoid breaking changes if ops move around --- .../ibis/expr/operations/analytic.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 78aeb67a37..3d6a3b37b1 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -2,23 +2,21 @@ from __future__ import annotations -import ibis.expr.datatypes as dt -import ibis.expr.operations.analytic as ibis_ops_analytic -import ibis.expr.operations.core as ibis_ops_core +import ibis.expr.operations as ops import ibis.expr.rules as rlz -class FirstNonNullValue(ibis_ops_analytic.Analytic): +class FirstNonNullValue(ops.Analytic): """Retrieve the first element.""" - arg: ibis_ops_core.Column[dt.Any] + arg: ops.Column dtype = rlz.dtype_like("arg") -class LastNonNullValue(ibis_ops_analytic.Analytic): +class LastNonNullValue(ops.Analytic): """Retrieve the last element.""" - arg: ibis_ops_core.Column[dt.Any] + arg: ops.Column dtype = rlz.dtype_like("arg") From 33bd2e0a99e24c492646846ad22aaa0a160c45d4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 17:04:00 +0000 Subject: [PATCH 15/29] workaround nullscalar issue --- bigframes/operations/__init__.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index ec3816ad11..0655aafdb3 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -732,10 +732,29 @@ def ne_op( return x != y +def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue): + return ibis.where( + where_value, + value, + ibis.null(), + ) + + def and_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For AND, when we encounter a + # NULL value, we only know when the result is FALSE, otherwise the result + # is unknown (NULL). See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(False)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(False)) + return typing.cast(ibis_types.BooleanValue, x) & typing.cast( ibis_types.BooleanValue, y ) @@ -745,6 +764,17 @@ def or_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For OR, when we encounter a + # NULL value, we only know when the result is TRUE, otherwise the result + # is unknown (NULL). 
See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(True)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(True)) + return typing.cast(ibis_types.BooleanValue, x) | typing.cast( ibis_types.BooleanValue, y ) @@ -756,7 +786,7 @@ def add_op( y: ibis_types.Value, ): if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): - return + return ibis.null() try: # Could be string concatenation or numeric addition. return x + y # type: ignore From c373dc0edda6a41739b1985b685d95612d7f7f27 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 17:40:14 +0000 Subject: [PATCH 16/29] update usage of percent_rank to explicitly order by the value --- bigframes/core/reshape/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index dc61c3baad..24c1bff309 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -18,6 +18,7 @@ import bigframes.constants as constants import bigframes.core as core +import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dataframe import bigframes.operations as ops @@ -145,7 +146,10 @@ def qcut( block, result = block.apply_window_op( x._value_column, agg_ops.QcutOp(q), - window_spec=core.WindowSpec(grouping_keys=(nullity_id,)), + window_spec=core.WindowSpec( + grouping_keys=(nullity_id,), + ordering=(order.OrderingColumnReference(x._value_column),), + ), ) block, result = block.apply_binary_op( result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label From 41364b591b14925ef113f7fd590b24a987059302 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 18:26:44 +0000 Subject: [PATCH 17/29] disable ibis prerelease tests for now --- noxfile.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/noxfile.py b/noxfile.py index 5416a29c9a..4b0f6cdb02 100644 --- a/noxfile.py +++ b/noxfile.py @@ -532,12 +532,12 @@ def prerelease(session: nox.sessions.Session, tests_path): # "-e", # Use -e so that py.typed file is included. # "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", # ) - session.install( - "--upgrade", - "--pre", - "ibis-framework", - ) - already_installed.add("ibis-framework") + # session.install( + # "--upgrade", + # "--pre", + # "ibis-framework", + # ) + # already_installed.add("ibis-framework") # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 session.install("--no-deps", "db-dtypes") From 7a8784c2c0bbad63e73de9c799223220230e8233 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 19:12:31 +0000 Subject: [PATCH 18/29] fix unit_prerelease --- noxfile.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/noxfile.py b/noxfile.py index 4b0f6cdb02..c0ec3b0c54 100644 --- a/noxfile.py +++ b/noxfile.py @@ -532,12 +532,12 @@ def prerelease(session: nox.sessions.Session, tests_path): # "-e", # Use -e so that py.typed file is included. 
# "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", # ) - # session.install( - # "--upgrade", - # "--pre", - # "ibis-framework", - # ) - # already_installed.add("ibis-framework") + session.install( + "--upgrade", + # "--pre", + "ibis-framework>=7.1.0,<8.0.0dev", + ) + already_installed.add("ibis-framework") # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 session.install("--no-deps", "db-dtypes") From 7b304a90315c05b1b05ba9aede235a3fd4f7fab3 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 19:57:38 +0000 Subject: [PATCH 19/29] refactor: use ibis UDF functionality for remote_function --- bigframes/remote_function.py | 63 ++++++++++++++---------------------- 1 file changed, 24 insertions(+), 39 deletions(-) diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index f54c26fa56..6babd2590d 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -14,7 +14,6 @@ from __future__ import annotations -import functools import hashlib import inspect import logging @@ -28,6 +27,7 @@ import textwrap from typing import List, NamedTuple, Optional, Sequence, TYPE_CHECKING +import ibis import requests if TYPE_CHECKING: @@ -43,12 +43,9 @@ resourcemanager_v3, ) import google.iam.v1 -from ibis.backends.bigquery.compiler import compiles from ibis.backends.bigquery.datatypes import BigQueryType from ibis.expr.datatypes.core import DataType as IbisDataType from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type -import ibis.expr.operations as ops -import ibis.expr.rules as rlz from bigframes import clients import bigframes.constants as constants @@ -529,35 +526,6 @@ def get_remote_function_specs(self, remote_function_name): return (http_endpoint, bq_connection) -def remote_function_node( - routine_ref: bigquery.RoutineReference, ibis_signature: IbisSignature -): - """Creates an Ibis node representing a remote function call.""" - - fields = { - name: rlz.ValueOf(None if type_ == "ANY TYPE" else type_) - for name, type_ in zip( - ibis_signature.parameter_names, ibis_signature.input_types - ) - } - - fields["dtype"] = ibis_signature.output_type # type: ignore - fields["shape"] = rlz.shape_like("args") - - node = type(routine_ref_to_string_for_query(routine_ref), (ops.ValueOp,), fields) # type: ignore - - @compiles(node) - def compile_node(t, op): - return "{}({})".format(node.__name__, ", ".join(map(t.translate, op.args))) - - def f(*args, **kwargs): - return node(*args, **kwargs).to_expr() - - f.bigframes_remote_function = str(routine_ref) # type: ignore - - return f - - class UnsupportedTypeError(ValueError): def __init__(self, type_, supported_types): self.type = type_ @@ -857,14 +825,18 @@ def wrapper(f): packages, ) - node = remote_function_node(dataset_ref.routine(rf_name), ibis_signature) - - node = functools.wraps(f)(node) - node.__signature__ = signature + node = ibis.udf.scalar.builtin( + f, + name=rf_name, + # TODO(swast): The backticks shouldn't be necessary. Ibis should be + # escaping the "schema" for us. 
+ schema=f"`{dataset_ref.project}.{dataset_ref.dataset_id}`", + signature=(ibis_signature.input_types, ibis_signature.output_type), + ) node.bigframes_cloud_function = ( remote_function_client.get_cloud_function_fully_qualified_name(cf_name) ) - + node.bigframes_remote_function = str(dataset_ref.routine(rf_name)) # type: ignore return node return wrapper @@ -914,4 +886,17 @@ def read_gbq_function( f"{constants.FEEDBACK_LINK}" ) - return remote_function_node(routine_ref, ibis_signature) + def node(*args, **kwargs): + f"""Function {function_name}.""" + + node.__name__ = routine_ref.routine_id + node.bigframes_remote_function = str(routine_ref) # type: ignore + + return ibis.udf.scalar.builtin( + node, + name=routine_ref.routine_id, + # TODO(swast): The backticks shouldn't be necessary. Ibis should be + # escaping the "schema" for us. + schema=f"`{routine_ref.project}.{routine_ref.dataset_id}`", + signature=(ibis_signature.input_types, ibis_signature.output_type), + ) From b09c549052a4c5fe15ae7e609281bd4ca755d3be Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 21 Dec 2023 23:26:56 +0000 Subject: [PATCH 20/29] fix tests except for one remaining issue with deferred ordering id --- bigframes/core/compile/compiled.py | 15 +- bigframes/remote_function.py | 30 +-- noxfile.py | 2 +- setup.py | 3 +- tests/system/small/test_remote_function.py | 2 +- tests/unit/test_remote_function.py | 5 +- .../ibis/backends/bigquery/datatypes.py | 176 ++++++++++++++++++ 7 files changed, 203 insertions(+), 30 deletions(-) create mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 199c8db785..f2275fd05c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -1177,17 +1177,14 @@ def _to_ibis_expr( if not columns: return ibis.memtable([]) + # Make sure we don't have any unbound (deferred) columns. + table = self._table.select(columns) + # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. - table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type( - column.resolve(self._table) - # TODO(https://github.com/ibis-project/ibis/issues/7613): use - # public API to refer to Deferred type. 
- if isinstance(column, ibis.common.deferred.Deferred) - else column - ) - for column in columns + table = table.select( + bigframes.dtypes.ibis_value_to_canonical_type(table[column]) + for column in table.columns ) base_table = table if self._reduced_predicate is not None: diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 6babd2590d..7979c95b33 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -43,12 +43,12 @@ resourcemanager_v3, ) import google.iam.v1 -from ibis.backends.bigquery.datatypes import BigQueryType from ibis.expr.datatypes.core import DataType as IbisDataType from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type from bigframes import clients import bigframes.constants as constants +import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes logger = logging.getLogger(__name__) @@ -184,12 +184,14 @@ def create_bq_remote_function( # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 bq_function_args = [] - bq_function_return_type = BigQueryType.from_ibis(output_type) + bq_function_return_type = third_party_ibis_bqtypes.BigQueryType.from_ibis( + output_type + ) # We are expecting the input type annotations to be 1:1 with the input args for idx, name in enumerate(input_args): bq_function_args.append( - f"{name} {BigQueryType.from_ibis(input_types[idx])}" + f"{name} {third_party_ibis_bqtypes.BigQueryType.from_ibis(input_types[idx])}" ) create_function_ddl = f""" CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) @@ -541,7 +543,7 @@ def ibis_type_from_python_type(t: type) -> IbisDataType: def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> IbisDataType: if tk not in SUPPORTED_IO_BIGQUERY_TYPEKINDS: raise UnsupportedTypeError(tk, SUPPORTED_IO_BIGQUERY_TYPEKINDS) - return BigQueryType.to_ibis(tk) + return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) def ibis_signature_from_python_signature( @@ -828,9 +830,7 @@ def wrapper(f): node = ibis.udf.scalar.builtin( f, name=rf_name, - # TODO(swast): The backticks shouldn't be necessary. Ibis should be - # escaping the "schema" for us. - schema=f"`{dataset_ref.project}.{dataset_ref.dataset_id}`", + schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", signature=(ibis_signature.input_types, ibis_signature.output_type), ) node.bigframes_cloud_function = ( @@ -886,17 +886,17 @@ def read_gbq_function( f"{constants.FEEDBACK_LINK}" ) - def node(*args, **kwargs): - f"""Function {function_name}.""" + # The name "args" conflicts with the Ibis operator, so we use + # non-standard names for the arguments here. + def node(*ignored_args, **ignored_kwargs): + f"""Remote function {str(routine_ref)}.""" node.__name__ = routine_ref.routine_id - node.bigframes_remote_function = str(routine_ref) # type: ignore - - return ibis.udf.scalar.builtin( + node = ibis.udf.scalar.builtin( node, name=routine_ref.routine_id, - # TODO(swast): The backticks shouldn't be necessary. Ibis should be - # escaping the "schema" for us. 
- schema=f"`{routine_ref.project}.{routine_ref.dataset_id}`", + schema=f"{routine_ref.project}.{routine_ref.dataset_id}", signature=(ibis_signature.input_types, ibis_signature.output_type), ) + node.bigframes_remote_function = str(routine_ref) # type: ignore + return node diff --git a/noxfile.py b/noxfile.py index c4bbd7a65a..6911e1541e 100644 --- a/noxfile.py +++ b/noxfile.py @@ -535,7 +535,7 @@ def prerelease(session: nox.sessions.Session, tests_path): session.install( "--upgrade", # "--pre", - "ibis-framework>=7.1.0,<7.2.0dev", + "ibis-framework>=7.2.0,<8.0.0dev", ) already_installed.add("ibis-framework") diff --git a/setup.py b/setup.py index 345d1ea752..8c414b301b 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - # TODO: Relax upper bound once we have fixed unit tests with 7.2.0. - "ibis-framework[bigquery] >=7.1.0,<7.2.0dev", + "ibis-framework[bigquery] >=7.2.0,<8.0.0dev", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. "pandas >=1.5.0,<2.1.4", "pydata-google-auth >=1.8.2", diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index a98056d82a..933bc6696d 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -534,7 +534,7 @@ def square1(x): @pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): +def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id, scalars_df_index): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( name="x", diff --git a/tests/unit/test_remote_function.py b/tests/unit/test_remote_function.py index 540f4020d3..e9ea5a3d14 100644 --- a/tests/unit/test_remote_function.py +++ b/tests/unit/test_remote_function.py @@ -12,17 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ibis.backends.bigquery import datatypes as bq_types from ibis.expr import datatypes as ibis_types from bigframes import remote_function as rf +import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes def test_supported_types_correspond(): # The same types should be representable by the supported Python and BigQuery types. 
ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES} ibis_types_from_bigquery = { - bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS + third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) + for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS } assert ibis_types_from_python == ibis_types_from_bigquery diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py new file mode 100644 index 0000000000..e7200cbf2a --- /dev/null +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py @@ -0,0 +1,176 @@ +# Contains code from +# https://github.com/ibis-project/ibis/blob/697d325f13bdf2746a50e86204eb8834b1710bd6/ibis/backends/bigquery/datatypes.py + +from __future__ import annotations + +import google.cloud.bigquery as bq +import ibis +import ibis.expr.datatypes as dt +import ibis.expr.schema as sch +from ibis.formats import SchemaMapper, TypeMapper +import sqlglot as sg + +_from_bigquery_types = { + "INT64": dt.Int64, + "INTEGER": dt.Int64, + "FLOAT": dt.Float64, + "FLOAT64": dt.Float64, + "BOOL": dt.Boolean, + "BOOLEAN": dt.Boolean, + "STRING": dt.String, + "DATE": dt.Date, + "TIME": dt.Time, + "BYTES": dt.Binary, + "JSON": dt.JSON, +} + + +class BigQueryType(TypeMapper): + @classmethod + def to_ibis(cls, typ: str, nullable: bool = True) -> dt.DataType: + if typ == "DATETIME": + return dt.Timestamp(timezone=None, nullable=nullable) + elif typ == "TIMESTAMP": + return dt.Timestamp(timezone="UTC", nullable=nullable) + elif typ == "NUMERIC": + return dt.Decimal(38, 9, nullable=nullable) + elif typ == "BIGNUMERIC": + return dt.Decimal(76, 38, nullable=nullable) + elif typ == "GEOGRAPHY": + return dt.GeoSpatial(geotype="geography", srid=4326, nullable=nullable) + else: + try: + return _from_bigquery_types[typ](nullable=nullable) + except KeyError: + raise TypeError(f"Unable to convert BigQuery type to ibis: {typ}") + + @classmethod + def from_ibis(cls, dtype: dt.DataType) -> str: + if dtype.is_floating(): + return "FLOAT64" + elif dtype.is_uint64(): + raise TypeError( + "Conversion from uint64 to BigQuery integer type (int64) is lossy" + ) + elif dtype.is_integer(): + return "INT64" + elif dtype.is_binary(): + return "BYTES" + elif dtype.is_date(): + return "DATE" + elif dtype.is_timestamp(): + if dtype.timezone is None: + return "DATETIME" + elif dtype.timezone == "UTC": + return "TIMESTAMP" + else: + raise TypeError( + "BigQuery does not support timestamps with timezones other than 'UTC'" + ) + elif dtype.is_decimal(): + if (dtype.precision, dtype.scale) == (76, 38): + return "BIGNUMERIC" + if (dtype.precision, dtype.scale) in [(38, 9), (None, None)]: + return "NUMERIC" + raise TypeError( + "BigQuery only supports decimal types with precision of 38 and " + f"scale of 9 (NUMERIC) or precision of 76 and scale of 38 (BIGNUMERIC). " + f"Current precision: {dtype.precision}. Current scale: {dtype.scale}" + ) + elif dtype.is_array(): + return f"ARRAY<{cls.from_ibis(dtype.value_type)}>" + elif dtype.is_struct(): + fields = ( + f"{sg.to_identifier(k).sql('bigquery')} {cls.from_ibis(v)}" + for k, v in dtype.fields.items() + ) + return "STRUCT<{}>".format(", ".join(fields)) + elif dtype.is_json(): + return "JSON" + elif dtype.is_geospatial(): + if (dtype.geotype, dtype.srid) == ("geography", 4326): + return "GEOGRAPHY" + raise TypeError( + "BigQuery geography uses points on WGS84 reference ellipsoid." 
+ f"Current geotype: {dtype.geotype}, Current srid: {dtype.srid}" + ) + elif dtype.is_map(): + raise NotImplementedError("Maps are not supported in BigQuery") + else: + return str(dtype).upper() + + +class BigQuerySchema(SchemaMapper): + @classmethod + def from_ibis(cls, schema: sch.Schema) -> list[bq.SchemaField]: + schema_fields = [] + + for name, typ in ibis.schema(schema).items(): + if typ.is_array(): + value_type = typ.value_type + if value_type.is_array(): + raise TypeError("Nested arrays are not supported in BigQuery") + + is_struct = value_type.is_struct() + + field_type = ( + "RECORD" if is_struct else BigQueryType.from_ibis(typ.value_type) + ) + mode = "REPEATED" + fields = cls.from_ibis(ibis.schema(getattr(value_type, "fields", {}))) + elif typ.is_struct(): + field_type = "RECORD" + mode = "NULLABLE" if typ.nullable else "REQUIRED" + fields = cls.from_ibis(ibis.schema(typ.fields)) + else: + field_type = BigQueryType.from_ibis(typ) + mode = "NULLABLE" if typ.nullable else "REQUIRED" + fields = [] + + schema_fields.append( + bq.SchemaField(name, field_type=field_type, mode=mode, fields=fields) + ) + return schema_fields + + @classmethod + def _dtype_from_bigquery_field(cls, field: bq.SchemaField) -> dt.DataType: + typ = field.field_type + if typ == "RECORD": + assert field.fields, "RECORD fields are empty" + fields = {f.name: cls._dtype_from_bigquery_field(f) for f in field.fields} + dtype = dt.Struct(fields) + else: + dtype = BigQueryType.to_ibis(typ) + + mode = field.mode + if mode == "NULLABLE": + return dtype.copy(nullable=True) + elif mode == "REQUIRED": + return dtype.copy(nullable=False) + elif mode == "REPEATED": + # arrays with NULL elements aren't supported + return dt.Array(dtype.copy(nullable=False)) + else: + raise TypeError(f"Unknown BigQuery field.mode: {mode}") + + @classmethod + def to_ibis(cls, fields: list[bq.SchemaField]) -> sch.Schema: + return sch.Schema({f.name: cls._dtype_from_bigquery_field(f) for f in fields}) + + +# TODO(kszucs): we can eliminate this function by making dt.DataType traversible +# using ibis.common.graph.Node, similarly to how we traverse ops.Node instances: +# node.find(types) +def spread_type(dt: dt.DataType): + """Returns a generator that contains all the types in the given type. + + For complex types like set and array, it returns the types of the elements. 
+ """ + if dt.is_array(): + yield from spread_type(dt.value_type) + elif dt.is_struct(): + for type_ in dt.types: + yield from spread_type(type_) + elif dt.is_map(): + raise NotImplementedError("Maps are not supported in BigQuery") + yield dt From a142b29d5385880bb57973a7c7250d85ad8cc7a1 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 2 Jan 2024 21:00:12 +0000 Subject: [PATCH 21/29] use unordered mode with pandas backend in unit tests --- tests/unit/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 623448b3aa..6c880fd584 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -117,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr._compile_ordered()._to_ibis_expr(ordering_mode="unordered") + actual = expr._compile_unordered()._to_ibis_expr() assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" From aaeabd43fb80659111226ce70463eb278e0a323d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 2 Jan 2024 21:37:21 +0000 Subject: [PATCH 22/29] fix constraints --- testing/constraints-3.9.txt | 2 +- tests/system/small/test_remote_function.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 218255c77e..a5cba4cba0 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -45,7 +45,7 @@ greenlet==2.0.2 grpc-google-iam-v1==0.12.6 grpcio==1.53.0 grpcio-status==1.48.2 -ibis-framework==7.1.0 +ibis-framework==7.2.0 humanize==4.6.0 identify==2.5.22 idna==3.4 diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 933bc6696d..a98056d82a 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -534,7 +534,7 @@ def square1(x): @pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id, scalars_df_index): +def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( name="x", From 63cc0f3d8429ba9b764ad137cc91769a1148a277 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 26 Feb 2024 19:26:16 +0000 Subject: [PATCH 23/29] try ibis 8.0.0 --- setup.py | 4 ++-- testing/constraints-3.9.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 9b8483af66..5c7ff8bb7e 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - "ibis-framework[bigquery] >=7.2.0,<8.0.0dev", + "ibis-framework[bigquery] >=8.0.0,<9.0.0dev", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. "pandas >=1.5.0,<2.1.4", "pydata-google-auth >=1.8.2", @@ -54,7 +54,7 @@ # Keep sqlglot versions in sync with ibis-framework. 
This avoids problems # where the incorrect version of sqlglot is installed, such as # https://github.com/googleapis/python-bigquery-dataframes/issues/315 - "sqlglot >=19.9.0,<20", + "sqlglot >=19.9.0", "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index def923436a..3617c218bb 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -10,7 +10,7 @@ google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 -ibis-framework==7.2.0 +ibis-framework==8.0.0 pandas==1.5.0 pydata-google-auth==1.8.2 requests==2.27.1 From c434824e93c2ac027cbe9fdf6cb349d5d7ed2a93 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 26 Feb 2024 19:43:47 +0000 Subject: [PATCH 24/29] fix unit tests --- tests/unit/test_remote_function.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/test_remote_function.py b/tests/unit/test_remote_function.py index 5786968845..629bc5326a 100644 --- a/tests/unit/test_remote_function.py +++ b/tests/unit/test_remote_function.py @@ -15,7 +15,6 @@ from ibis.expr import datatypes as ibis_types import bigframes.dtypes -from bigframes.functions import remote_function as rf import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes @@ -26,7 +25,7 @@ def test_supported_types_correspond(): } ibis_types_from_bigquery = { third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) - for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS + for tk in bigframes.dtypes.SUPPORTED_IO_BIGQUERY_TYPEKINDS } assert ibis_types_from_python == ibis_types_from_bigquery From 2bd257312a5679a9a9f2c0291c1ee654bb6641c6 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 26 Feb 2024 22:55:07 +0000 Subject: [PATCH 25/29] fix tests --- bigframes/dtypes.py | 4 ++-- bigframes/functions/remote_function.py | 17 ++--------------- setup.py | 2 +- testing/constraints-3.9.txt | 2 +- 4 files changed, 6 insertions(+), 19 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 6e3bc25c47..8a2055ef7f 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -23,7 +23,6 @@ import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery import ibis -from ibis.backends.bigquery.datatypes import BigQueryType import ibis.expr.datatypes as ibis_dtypes from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type import ibis.expr.types as ibis_types @@ -33,6 +32,7 @@ import bigframes.constants as constants import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers +import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops # Type hints for Pandas dtypes supported by BigQuery DataFrame @@ -643,4 +643,4 @@ def ibis_type_from_python_type(t: type) -> ibis_dtypes.DataType: def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: if tk not in SUPPORTED_IO_BIGQUERY_TYPEKINDS: raise UnsupportedTypeError(tk, SUPPORTED_IO_BIGQUERY_TYPEKINDS) - return BigQueryType.to_ibis(tk) + return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index b5b6f7ec7d..5fe203feb3 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -44,7 +44,6 @@ ) import 
google.iam.v1 from ibis.expr.datatypes.core import DataType as IbisDataType -from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type from bigframes import clients import bigframes.constants as constants @@ -521,18 +520,6 @@ def __init__(self, type_, supported_types): self.supported_types = supported_types -def ibis_type_from_python_type(t: type) -> IbisDataType: - if t not in bigframes.dtypes.SUPPORTED_IO_PYTHON_TYPES: - raise UnsupportedTypeError(t, bigframes.dtypes.SUPPORTED_IO_PYTHON_TYPES) - return python_type_to_bigquery_type(t) - - -def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> IbisDataType: - if tk not in bigframes.dtypes.SUPPORTED_IO_BIGQUERY_TYPEKINDS: - raise UnsupportedTypeError(tk, bigframes.dtypes.SUPPORTED_IO_BIGQUERY_TYPEKINDS) - return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) - - def ibis_signature_from_python_signature( signature: inspect.Signature, input_types: Sequence[type], @@ -823,7 +810,7 @@ def wrapper(f): node = ibis.udf.scalar.builtin( f, name=rf_name, - schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", + schema=f"`{dataset_ref.project}.{dataset_ref.dataset_id}`", signature=(ibis_signature.input_types, ibis_signature.output_type), ) node.bigframes_cloud_function = ( @@ -888,7 +875,7 @@ def node(*ignored_args, **ignored_kwargs): node = ibis.udf.scalar.builtin( node, name=routine_ref.routine_id, - schema=f"{routine_ref.project}.{routine_ref.dataset_id}", + schema=f"`{routine_ref.project}.{routine_ref.dataset_id}`", signature=(ibis_signature.input_types, ibis_signature.output_type), ) node.bigframes_remote_function = str(routine_ref) # type: ignore diff --git a/setup.py b/setup.py index 5c7ff8bb7e..b6bcca4c7d 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ # Keep sqlglot versions in sync with ibis-framework. This avoids problems # where the incorrect version of sqlglot is installed, such as # https://github.com/googleapis/python-bigquery-dataframes/issues/315 - "sqlglot >=19.9.0", + "sqlglot >=20.8.0", "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 3617c218bb..c4fed64fbd 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -16,7 +16,7 @@ pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 sqlalchemy==1.4 -sqlglot==19.9.0 +sqlglot==20.8.0 tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 From 24038f161f5f9d5d8bbf2752779bf4656254140d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 27 Feb 2024 17:01:33 +0000 Subject: [PATCH 26/29] fix unit_prerelease tests --- noxfile.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index 4215eb46df..91d26cf695 100644 --- a/noxfile.py +++ b/noxfile.py @@ -565,12 +565,12 @@ def prerelease(session: nox.sessions.Session, tests_path): # session.install( # "--upgrade", # "-e", # Use -e so that py.typed file is included. 
- # "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", + # "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", # ) session.install( "--upgrade", - # "--pre", - "ibis-framework>=7.2.0,<8.0.0dev", + "--pre", + "ibis-framework>=8.0.0,<9.0.0dev", ) already_installed.add("ibis-framework") From 56ca0946e0928406bf83bffdb6689c1824bbd7d6 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 27 Feb 2024 17:05:42 +0000 Subject: [PATCH 27/29] fix remote function tests --- bigframes/functions/remote_function.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 5fe203feb3..af4c4b138a 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -810,7 +810,7 @@ def wrapper(f): node = ibis.udf.scalar.builtin( f, name=rf_name, - schema=f"`{dataset_ref.project}.{dataset_ref.dataset_id}`", + schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", signature=(ibis_signature.input_types, ibis_signature.output_type), ) node.bigframes_cloud_function = ( @@ -875,7 +875,7 @@ def node(*ignored_args, **ignored_kwargs): node = ibis.udf.scalar.builtin( node, name=routine_ref.routine_id, - schema=f"`{routine_ref.project}.{routine_ref.dataset_id}`", + schema=f"{routine_ref.project}.{routine_ref.dataset_id}", signature=(ibis_signature.input_types, ibis_signature.output_type), ) node.bigframes_remote_function = str(routine_ref) # type: ignore From d4e8d51b0d71d5b3f268efebeb2a2a401175b7d3 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 27 Feb 2024 18:12:48 +0000 Subject: [PATCH 28/29] fix nlargest/nsmallest --- bigframes/core/compile/aggregate_compiler.py | 2 +- tests/system/small/test_dataframe.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 1dad128599..86ba16e347 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -331,7 +331,7 @@ def _( op: agg_ops.RankOp, column: ibis_types.Column, window=None ) -> ibis_types.IntegerValue: # Ibis produces 0-based ranks, while pandas creates 1-based ranks - return _apply_window_if_present(column.rank(), window) + 1 + return _apply_window_if_present(ibis.rank(), window) + 1 @compile_unary_agg.register diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 8f75534fc6..9f4e138b73 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -157,15 +157,13 @@ def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_inde ], ) def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): - bf_result = scalars_df_index.nlargest( - 3, ["bool_col", "int64_too"], keep=keep - ).to_pandas() + bf_result = scalars_df_index.nlargest(3, ["bool_col", "int64_too"], keep=keep) pd_result = scalars_pandas_df_index.nlargest( 3, ["bool_col", "int64_too"], keep=keep ) pd.testing.assert_frame_equal( - bf_result, + bf_result.to_pandas(), pd_result, ) @@ -179,11 +177,11 @@ def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): ], ) def test_df_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): - bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep).to_pandas() + bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep) pd_result = scalars_pandas_df_index.nsmallest(6, ["bool_col"], keep=keep) 
pd.testing.assert_frame_equal( - bf_result, + bf_result.to_pandas(), pd_result, ) From 54df52b135d1d8436850c281f8fef2d79a9e9297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=2C=20formerly=29?= Date: Tue, 27 Feb 2024 16:40:00 -0600 Subject: [PATCH 29/29] synchronize max sqlglot with ibis --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b6bcca4c7d..516d5b8a19 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ # Keep sqlglot versions in sync with ibis-framework. This avoids problems # where the incorrect version of sqlglot is installed, such as # https://github.com/googleapis/python-bigquery-dataframes/issues/315 - "sqlglot >=20.8.0", + "sqlglot >=20.8.0,<=20.11", "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0",
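
The vendored BigQueryType and BigQuerySchema mappers introduced earlier in this series can be exercised on their own, which is a quick way to sanity-check the type mapping while reviewing. The snippet below is a minimal illustrative sketch, not part of any patch above: it assumes ibis-framework >= 8.0.0 is installed and that the vendored module is importable as third_party.bigframes_vendored.ibis.backends.bigquery.datatypes (the same import path used by the unit tests in this series), and the sample schema fields ("name", "points", "x", "y") are invented for illustration only.

# Illustrative sketch only; not part of any patch in this series.
import google.cloud.bigquery as bq
import ibis.expr.datatypes as dt

import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes

BigQueryType = third_party_ibis_bqtypes.BigQueryType
BigQuerySchema = third_party_ibis_bqtypes.BigQuerySchema

# Scalar round-trips between BigQuery type names and ibis dtypes,
# mirroring the branches in the vendored BigQueryType mapper.
assert BigQueryType.to_ibis("NUMERIC") == dt.Decimal(38, 9, nullable=True)
assert BigQueryType.from_ibis(dt.Timestamp(timezone="UTC")) == "TIMESTAMP"
assert BigQueryType.from_ibis(dt.Timestamp(timezone=None)) == "DATETIME"
assert BigQueryType.from_ibis(dt.Array(dt.int64)) == "ARRAY<INT64>"

# Schema mapping: REQUIRED fields become non-nullable dtypes, and a
# REPEATED RECORD field becomes an array of structs. Field names here
# are hypothetical sample data.
fields = [
    bq.SchemaField("name", "STRING", mode="REQUIRED"),
    bq.SchemaField(
        "points",
        "RECORD",
        mode="REPEATED",
        fields=(bq.SchemaField("x", "FLOAT64"), bq.SchemaField("y", "FLOAT64")),
    ),
]
schema = BigQuerySchema.to_ibis(fields)
assert schema["name"] == dt.String(nullable=False)
assert schema["points"].is_array() and schema["points"].value_type.is_struct()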