From 0e169215b132df5a9c142e040ac168fb53bfb0f2 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 9 Nov 2023 21:31:58 +0000 Subject: [PATCH 01/12] feat: read_gbq creates order deterministically without table copy --- bigframes/session/__init__.py | 354 ++++++++++++---------------------- 1 file changed, 126 insertions(+), 228 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 8f9fa37787..45bf33c6fb 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -17,10 +17,10 @@ from __future__ import annotations import datetime +import itertools import logging import os import re -import textwrap import typing from typing import ( Any, @@ -597,8 +597,6 @@ def _read_gbq_table( else: index_cols = list(index_col) - hidden_cols: typing.Sequence[str] = () - for key in index_cols: if key not in table_expression.columns: raise ValueError( @@ -608,13 +606,10 @@ def _read_gbq_table( # If the index is unique and sortable, then we don't need to generate # an ordering column. ordering = None - is_total_ordering = False - if total_ordering_cols is not None: # Note: currently, this a table has a total ordering only when the # primary key(s) are set on a table. The query engine assumes such # columns are unique, even if not enforced. - is_total_ordering = True ordering = orderings.ExpressionOrdering( ordering_value_columns=tuple( core.OrderingColumnReference(column_id) @@ -622,41 +617,17 @@ def _read_gbq_table( ), total_ordering_columns=frozenset(total_ordering_cols), ) - - if len(index_cols) != 0: - index_labels = typing.cast(List[Optional[str]], index_cols) - else: - # Use the total_ordering_cols to project offsets to use as the default index. - table_expression = table_expression.order_by(index_cols) - default_index_id = guid.generate_guid("bigframes_index_") - default_index_col = ( - ibis.row_number().cast(ibis_dtypes.int64).name(default_index_id) - ) - table_expression = table_expression.mutate( - **{default_index_id: default_index_col} - ) - index_cols = [default_index_id] - index_labels = [None] - elif len(index_cols) != 0: - index_labels = typing.cast(List[Optional[str]], index_cols) - distinct_table = table_expression.select(*index_cols).distinct() - is_unique_sql = f"""WITH full_table AS ( - {self.ibis_client.compile(table_expression)} - ), - distinct_table AS ( - {self.ibis_client.compile(distinct_table)} + column_values = [table_expression[col] for col in table_expression.columns] + array_value = core.ArrayValue.from_ibis( + self, + table_expression, + columns=column_values, + hidden_ordering_columns=[], + ordering=ordering, ) - SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, - (SELECT COUNT(*) FROM distinct_table) AS `distinct_count` - """ - results, query_job = self._start_query(is_unique_sql) - row = next(iter(results)) - - total_count = row["total_count"] - distinct_count = row["distinct_count"] - is_total_ordering = total_count == distinct_count - + elif len(index_cols) != 0: + # We have index columns, lets see if those are actually total_order_columns ordering = orderings.ExpressionOrdering( ordering_value_columns=tuple( [ @@ -666,142 +637,59 @@ def _read_gbq_table( ), total_ordering_columns=frozenset(index_cols), ) - - # We have a total ordering, so query via "time travel" so that - # the underlying data doesn't mutate. + is_total_ordering = self._check_index_uniqueness( + table_expression, index_cols + ) if is_total_ordering: - # Get the timestamp from the job metadata rather than the query - # text so that the query for determining uniqueness of the ID - # columns can be cached. - current_timestamp = query_job.started - - # The job finished, so we should have a start time. - assert current_timestamp is not None - table_expression = self.ibis_client.sql( - bigframes_io.create_snapshot_sql(table_ref, current_timestamp) + column_values = [ + table_expression[col] for col in table_expression.columns + ] + array_value = core.ArrayValue.from_ibis( + self, + table_expression, + columns=column_values, + hidden_ordering_columns=[], + ordering=ordering, ) else: - # Make sure when we generate an ordering, the row_number() - # coresponds to the index columns. - table_expression = table_expression.order_by(index_cols) - warnings.warn( - textwrap.dedent( - f""" - Got a non-unique index. A consistent ordering is not - guaranteed. DataFrame has {total_count} rows, - but only {distinct_count} distinct index values. - """, - ) - ) - - # When ordering by index columns, apply limit after ordering to - # make limit more predictable. - if max_results is not None: - table_expression = table_expression.limit(max_results) + array_value = self._create_total_ordering(table_expression) else: - if max_results is not None: - # Apply limit before generating rownums and creating temp table - # This makes sure the offsets are valid and limits the number of - # rows for which row numbers must be generated - table_expression = table_expression.limit(max_results) - table_expression, ordering = self._create_sequential_ordering( - table=table_expression, - api_name=api_name, - ) - hidden_cols = ( - (ordering.total_order_col.column_id,) - if ordering.total_order_col - else () - ) - assert len(ordering.ordering_value_columns) > 0 - is_total_ordering = True - # Block constructor will generate default index if passed empty - index_cols = [] - index_labels = [] - - return self._read_gbq_with_ordering( - table_expression=table_expression, - col_order=col_order, - index_cols=index_cols, - index_labels=index_labels, - hidden_cols=hidden_cols, - ordering=ordering, - is_total_ordering=is_total_ordering, - api_name=api_name, - ) + array_value = self._create_total_ordering(table_expression) - def _read_gbq_with_ordering( - self, - table_expression: ibis_types.Table, - *, - col_order: Iterable[str] = (), - col_labels: Iterable[Optional[str]] = (), - index_cols: Iterable[str] = (), - index_labels: Iterable[Optional[str]] = (), - hidden_cols: Iterable[str] = (), - ordering: orderings.ExpressionOrdering, - is_total_ordering: bool = False, - api_name: str, - ) -> dataframe.DataFrame: - """Internal helper method that loads DataFrame from Google BigQuery given an ordering column. + if col_order: + array_value = array_value.select_columns(tuple(col_order)) - Args: - table_expression: - an ibis table expression to be executed in BigQuery. - col_order: - List of BigQuery column ids in the desired order for results DataFrame. - col_labels: - List of column labels as the column names. - index_cols: - List of index ids to use as the index or multi-index. - index_labels: - List of index labels as names of index. - hidden_cols: - Columns that should be hidden. Ordering columns may (not always) be hidden - ordering: - Column name to be used for ordering. If not supplied, a default ordering is generated. - api_name: - The name of the API method. + value_columns = [col for col in array_value.column_ids if col not in index_cols] + block = blocks.Block( + array_value, + index_columns=index_cols, + column_labels=value_columns, + index_labels=index_cols, + ) + if max_results: + block = block.slice(stop=max_results) + return dataframe.DataFrame(block) - Returns: - A DataFrame representing results of the query or table. - """ - index_cols, index_labels = list(index_cols), list(index_labels) - if len(index_cols) != len(index_labels): - raise ValueError( - "Needs same number of index labels are there are index columns. " - f"Got {len(index_labels)}, expected {len(index_cols)}." - ) + def _check_index_uniqueness( + self, table: ibis_types.Table, index_cols: List[str] + ) -> bool: + distinct_table = table.select(*index_cols).distinct() + is_unique_sql = f"""WITH full_table AS ( + {self.ibis_client.compile(table)} + ), + distinct_table AS ( + {self.ibis_client.compile(distinct_table)} + ) - # Logic: - # no total ordering, index -> create sequential order, ordered by index, use for both ordering and index - # total ordering, index -> use ordering as ordering, index as index + SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, + (SELECT COUNT(*) FROM distinct_table) AS `distinct_count` + """ + results, _ = self._start_query(is_unique_sql) + row = next(iter(results)) - # This code block ensures the existence of a total ordering. - column_keys = list(col_order) - if len(column_keys) == 0: - non_value_columns = set([*index_cols, *hidden_cols]) - column_keys = [ - key for key in table_expression.columns if key not in non_value_columns - ] - if not is_total_ordering: - # Rows are not ordered, we need to generate a default ordering and materialize it - table_expression, ordering = self._create_sequential_ordering( - table=table_expression, - index_cols=index_cols, - api_name=api_name, - ) - index_col_values = [table_expression[index_id] for index_id in index_cols] - if not col_labels: - col_labels = column_keys - return self._read_ibis( - table_expression, - index_col_values, - index_labels, - column_keys, - col_labels, - ordering=ordering, - ) + total_count = row["total_count"] + distinct_count = row["distinct_count"] + return total_count == distinct_count def _read_bigquery_load_job( self, @@ -845,40 +733,6 @@ def _read_bigquery_load_job( col_order=col_order, ) - def _read_ibis( - self, - table_expression: ibis_types.Table, - index_cols: Iterable[ibis_types.Value], - index_labels: Iterable[blocks.Label], - column_keys: Iterable[str], - column_labels: Iterable[blocks.Label], - ordering: orderings.ExpressionOrdering, - ) -> dataframe.DataFrame: - """Turns a table expression (plus index column) into a DataFrame.""" - - columns = list(index_cols) - for key in column_keys: - if key not in table_expression.columns: - raise ValueError(f"Column '{key}' not found in this table.") - columns.append(table_expression[key]) - - non_hidden_ids = [col.get_name() for col in columns] - hidden_ordering_columns = [] - for ref in ordering.all_ordering_columns: - if ref.column_id not in non_hidden_ids: - hidden_ordering_columns.append(table_expression[ref.column_id]) - - block = blocks.Block( - core.ArrayValue.from_ibis( - self, table_expression, columns, hidden_ordering_columns, ordering - ), - index_columns=[index_col.get_name() for index_col in index_cols], - column_labels=column_labels, - index_labels=index_labels, - ) - - return dataframe.DataFrame(block) - def read_gbq_model(self, model_name: str): """Loads a BigQuery ML model from BigQuery. @@ -1000,17 +854,26 @@ def _read_pandas( ): new_idx_ids, idx_labels = [], [] - df = self._read_gbq_with_ordering( - table_expression=table_expression, - col_labels=col_labels, - index_cols=new_idx_ids, - index_labels=idx_labels, - hidden_cols=(ordering_col,), + column_values = [ + table_expression[col] + for col in table_expression.columns + if col != ordering_col + ] + array_value = core.ArrayValue.from_ibis( + self, + table_expression, + columns=column_values, + hidden_ordering_columns=[table_expression[ordering_col]], ordering=ordering, - is_total_ordering=True, - api_name=api_name, ) - return df + + block = blocks.Block( + array_value, + index_columns=new_idx_ids, + column_labels=col_labels, + index_labels=idx_labels, + ) + return dataframe.DataFrame(block) def read_csv( self, @@ -1297,34 +1160,51 @@ def _create_empty_temp_table( ) return bigquery.TableReference.from_string(table) - def _create_sequential_ordering( + def _create_total_ordering( self, table: ibis_types.Table, - index_cols: Iterable[str] = (), - api_name: str = "", - ) -> Tuple[ibis_types.Table, orderings.ExpressionOrdering]: + ) -> core.ArrayValue: # Since this might also be used as the index, don't use the default # "ordering ID" name. - default_ordering_name = guid.generate_guid("bigframes_ordering_") - default_ordering_col = ( - ibis.row_number().cast(ibis_dtypes.int64).name(default_ordering_name) + ordering_hash_part = guid.generate_guid("bigframes_ordering_") + ordering_rand_part = guid.generate_guid("bigframes_ordering_") + + str_values = list( + map(lambda col: _convert_to_string(table[col]), table.columns) ) - table = table.mutate(**{default_ordering_name: default_ordering_col}) - table_ref = self._ibis_to_session_table( - table, - cluster_cols=list(index_cols) + [default_ordering_name], - api_name=api_name, + full_row_str = ( + str_values[0].concat(*str_values[1:]) + if len(str_values) > 1 + else str_values[0] ) - table = self.ibis_client.table( - f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" + full_row_hash = full_row_str.hash().name(ordering_hash_part) + # Used to disambiguate between identical rows (which will have identical hash) + random_value = ibis.random().name(ordering_rand_part) + + # Might need to actually reselect here? + original_column_ids = table.columns + table_with_ordering = table.select( + itertools.chain(original_column_ids, [full_row_hash, random_value]) ) - ordering_reference = core.OrderingColumnReference(default_ordering_name) + + ordering_ref1 = core.OrderingColumnReference(ordering_hash_part) + ordering_ref2 = core.OrderingColumnReference(ordering_rand_part) ordering = orderings.ExpressionOrdering( - ordering_value_columns=tuple([ordering_reference]), - total_ordering_columns=frozenset([default_ordering_name]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ordering_value_columns=(ordering_ref1, ordering_ref2), + total_ordering_columns=frozenset([ordering_hash_part, ordering_rand_part]), + ) + columns = [table_with_ordering[col] for col in original_column_ids] + hidden_columns = [ + table_with_ordering[ordering_hash_part], + table_with_ordering[ordering_rand_part], + ] + return core.ArrayValue.from_ibis( + self, + table_with_ordering, + columns, + hidden_ordering_columns=hidden_columns, + ordering=ordering, ) - return table, ordering def _ibis_to_session_table( self, @@ -1559,3 +1439,21 @@ def _can_cluster_bq(field: bigquery.SchemaField): "BOOL", "BOOLEAN", ) + + +def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn: + # Some of these probably don't work + col_type = column.type() + if col_type.is_array(): + result = column.cast(ibis_dtypes.String(nullable=True)) + elif col_type.is_struct(): + result = column.cast(ibis_dtypes.String(nullable=True)) + elif col_type.is_geospatial(): + result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text() + elif col_type.is_json(): + result = column.cast(ibis_dtypes.String(nullable=True)) + elif col_type.is_string(): + result = column + else: + result = column.cast(ibis_dtypes.String(nullable=True)) + return typing.cast(ibis_types.StringColumn, result) From 30f7cd1294df28fabed59414b2b705bb56b4ef6c Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 00:23:16 +0000 Subject: [PATCH 02/12] add to_json_string ibis op as monkey patch --- bigframes/session/__init__.py | 9 +++------ .../ibis/backends/bigquery/registry.py | 12 +++++++++--- .../ibis/expr/operations/__init__.py | 5 +++-- .../bigframes_vendored/ibis/expr/operations/json.py | 9 +++++++++ 4 files changed, 24 insertions(+), 11 deletions(-) create mode 100644 third_party/bigframes_vendored/ibis/expr/operations/json.py diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 45bf33c6fb..7b7b8b95c7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -82,6 +82,7 @@ # Even though the ibis.backends.bigquery.registry import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet import third_party.bigframes_vendored.pandas.io.parsers.readers as third_party_pandas_readers @@ -1444,14 +1445,10 @@ def _can_cluster_bq(field: bigquery.SchemaField): def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn: # Some of these probably don't work col_type = column.type() - if col_type.is_array(): - result = column.cast(ibis_dtypes.String(nullable=True)) - elif col_type.is_struct(): - result = column.cast(ibis_dtypes.String(nullable=True)) + if col_type.is_array() or col_type.is_struct(): + result = vendored_ibis_ops.ToJsonString(column).to_expr() elif col_type.is_geospatial(): result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text() - elif col_type.is_json(): - result = column.cast(ibis_dtypes.String(nullable=True)) elif col_type.is_string(): result = column else: diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index a4e61ca0f9..e1b28690d7 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -22,10 +22,16 @@ def _last_non_null_value(translator, op: vendored_ibis_ops.LastNonNullValue): return f"LAST_VALUE({arg} IGNORE NULLS)" +def _to_json_string(translator, op: vendored_ibis_ops.ToJsonString): + arg = translator.translate(op.arg) + return f"TO_JSON_STRING({arg})" + + patched_ops = { - vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, - vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, - vendored_ibis_ops.LastNonNullValue: _last_non_null_value, + vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore + vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore + vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore + vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py index 1612d9c12e..8219701392 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py @@ -1,5 +1,6 @@ # Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/__init__.py from __future__ import annotations -from third_party.bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F403 -from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F403 +from third_party.bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403 +from third_party.bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403 +from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403 diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py new file mode 100644 index 0000000000..dbb3fa3066 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -0,0 +1,9 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/json.py +from __future__ import annotations + +import ibis.expr.datatypes as dt +from ibis.expr.operations.core import Unary + + +class ToJsonString(Unary): + output_dtype = dt.string From 16e8c6e065a3f18c3dacd74a27715b6a85b2c939 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 00:42:49 +0000 Subject: [PATCH 03/12] explicitly enforce region in read_gbq in session --- bigframes/session/__init__.py | 17 ++++++++++++++--- tests/system/small/test_pandas_options.py | 4 ++-- tests/system/small/test_progress_bar.py | 8 -------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7b7b8b95c7..6c0f71858f 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -510,6 +510,7 @@ def _read_gbq_table_to_ibis_with_total_ordering( table_ref: bigquery.table.TableReference, *, api_name: str, + enforce_region: bool = False, ) -> Tuple[ibis_types.Table, Optional[Sequence[str]]]: """Create a read-only Ibis table expression representing a table. @@ -536,6 +537,12 @@ def _read_gbq_table_to_ibis_with_total_ordering( # the same assumption and use these columns as the total ordering keys. table = self.bqclient.get_table(table_ref) + if enforce_region: + if table.location != self._location: + raise ValueError( + f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" + ) + # TODO(b/305264153): Use public properties to fetch primary keys once # added to google-cloud-bigquery. primary_keys = ( @@ -583,8 +590,7 @@ def _read_gbq_table( table_expression, total_ordering_cols, ) = self._read_gbq_table_to_ibis_with_total_ordering( - table_ref, - api_name=api_name, + table_ref, api_name=api_name, enforce_region=True ) for key in col_order: @@ -669,7 +675,12 @@ def _read_gbq_table( ) if max_results: block = block.slice(stop=max_results) - return dataframe.DataFrame(block) + df = dataframe.DataFrame(block) + + # If user provided index columns, should sort over it + if len(index_cols) > 0: + df.sort_index() + return df def _check_index_uniqueness( self, table: ibis_types.Table, index_cols: List[str] diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index ca67710d4e..bb84721332 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -145,7 +145,7 @@ def test_read_gbq_after_session_start_must_comply_with_default_location( # Doing read_gbq* from a table in another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + ValueError, match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -193,7 +193,7 @@ def test_read_gbq_must_comply_with_set_location_US( # Starting user journey with read_gbq* from another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + ValueError, match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 30ea63b483..c6eee82053 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -52,14 +52,6 @@ def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, assert_loading_msg_exist(capsys.readouterr().out) -def test_progress_bar_read_gbq(session: bf.Session, penguins_table_id: str, capsys): - bf.options.display.progress_bar = "terminal" - capsys.readouterr() # clear output - session.read_gbq(penguins_table_id) - - assert_loading_msg_exist(capsys.readouterr().out) - - def test_progress_bar_extract_jobs( penguins_df_default_index: bf.dataframe.DataFrame, gcs_folder, capsys ): From 53cee9930356413867262a23f75f8b2ec796ed4f Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 01:47:04 +0000 Subject: [PATCH 04/12] order some ml outputs, fix test failulres --- bigframes/ml/core.py | 10 +- bigframes/session/__init__.py | 2 +- tests/system/small/ml/test_cluster.py | 110 +++++++++++--------- tests/system/small/ml/test_core.py | 104 +++++++++--------- tests/system/small/ml/test_decomposition.py | 104 +++++++++--------- tests/system/small/ml/test_forecasting.py | 6 ++ tests/system/small/test_pandas_options.py | 10 +- 7 files changed, 186 insertions(+), 160 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 4c5a48cf62..b9627e5a71 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -126,7 +126,7 @@ def generate_text_embedding( def forecast(self) -> bpd.DataFrame: sql = self._model_manipulation_sql_generator.ml_forecast() - return self._session.read_gbq(sql) + return self._session.read_gbq(sql, index_col="forecast_timestamp").reset_index() def evaluate(self, input_data: Optional[bpd.DataFrame] = None): # TODO: validate input data schema @@ -139,14 +139,18 @@ def centroids(self) -> bpd.DataFrame: sql = self._model_manipulation_sql_generator.ml_centroids() - return self._session.read_gbq(sql) + return self._session.read_gbq( + sql, index_col=["centroid_id", "feature"] + ).reset_index() def principal_components(self) -> bpd.DataFrame: assert self._model.model_type == "PCA" sql = self._model_manipulation_sql_generator.ml_principal_components() - return self._session.read_gbq(sql) + return self._session.read_gbq( + sql, index_col=["principal_component_id", "feature"] + ).reset_index() def principal_component_info(self) -> bpd.DataFrame: assert self._model.model_type == "PCA" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7249b32725..e436e8247b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -538,7 +538,7 @@ def _read_gbq_table_to_ibis_with_total_ordering( table = self.bqclient.get_table(table_ref) if enforce_region: - if table.location != self._location: + if table.location.casefold() != self._location.casefold(): raise ValueError( f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" ) diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index caeffa7768..266a38e3ee 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -89,59 +89,67 @@ def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans): def test_kmeans_cluster_centers(penguins_kmeans_model: cluster.KMeans): - result = penguins_kmeans_model.cluster_centers_.to_pandas() - expected = pd.DataFrame( - { - "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], - "feature": [ - "culmen_length_mm", - "culmen_depth_mm", - "flipper_length_mm", - "sex", - ] - * 3, - "numerical_value": [ - 47.509677, - 14.993548, - 217.040123, - pd.NA, - 38.207813, - 18.03125, - 187.992188, - pd.NA, - 47.036346, - 18.834808, - 197.1612, - pd.NA, - ], - "categorical_value": [ - [], - [], - [], - [ - {"category": ".", "value": 0.008064516129032258}, - {"category": "MALE", "value": 0.49193548387096775}, - {"category": "FEMALE", "value": 0.47580645161290325}, - {"category": "_null_filler", "value": 0.024193548387096774}, - ], - [], - [], - [], - [ - {"category": "MALE", "value": 0.34375}, - {"category": "FEMALE", "value": 0.625}, - {"category": "_null_filler", "value": 0.03125}, + result = ( + penguins_kmeans_model.cluster_centers_.to_pandas() + .sort_values(["centroid_id", "feature"]) + .reset_index(drop=True) + ) + expected = ( + pd.DataFrame( + { + "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], + "feature": [ + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + * 3, + "numerical_value": [ + 47.509677, + 14.993548, + 217.040123, + pd.NA, + 38.207813, + 18.03125, + 187.992188, + pd.NA, + 47.036346, + 18.834808, + 197.1612, + pd.NA, ], - [], - [], - [], - [ - {"category": "MALE", "value": 0.6847826086956522}, - {"category": "FEMALE", "value": 0.2826086956521739}, - {"category": "_null_filler", "value": 0.03260869565217391}, + "categorical_value": [ + [], + [], + [], + [ + {"category": ".", "value": 0.008064516129032258}, + {"category": "MALE", "value": 0.49193548387096775}, + {"category": "FEMALE", "value": 0.47580645161290325}, + {"category": "_null_filler", "value": 0.024193548387096774}, + ], + [], + [], + [], + [ + {"category": "MALE", "value": 0.34375}, + {"category": "FEMALE", "value": 0.625}, + {"category": "_null_filler", "value": 0.03125}, + ], + [], + [], + [], + [ + {"category": "MALE", "value": 0.6847826086956522}, + {"category": "FEMALE", "value": 0.2826086956521739}, + {"category": "_null_filler", "value": 0.03260869565217391}, + ], ], - ], - }, + }, + ) + .sort_values(["centroid_id", "feature"]) + .reset_index(drop=True) ) pd.testing.assert_frame_equal( result, diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index ec1f351d87..732ec4bb5e 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -148,59 +148,63 @@ def test_pca_model_principal_components(penguins_bqml_pca_model: core.BqmlModel) # result is too long, only check the first principal component here. result = result.head(7) - expected = pd.DataFrame( - { - "principal_component_id": [0] * 7, - "feature": [ - "species", - "island", - "culmen_length_mm", - "culmen_depth_mm", - "flipper_length_mm", - "body_mass_g", - "sex", - ], - "numerical_value": [ - pd.NA, - pd.NA, - 0.401489, - -0.377482, - 0.524052, - 0.501174, - pd.NA, - ], - "categorical_value": [ - [ - { - "category": "Gentoo penguin (Pygoscelis papua)", - "value": 0.25068877125667804, - }, - { - "category": "Adelie Penguin (Pygoscelis adeliae)", - "value": -0.20622291900416198, - }, - { - "category": "Chinstrap penguin (Pygoscelis antarctica)", - "value": -0.030161149275185855, - }, + expected = ( + pd.DataFrame( + { + "principal_component_id": [0] * 7, + "feature": [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", ], - [ - {"category": "Biscoe", "value": 0.19761120114410635}, - {"category": "Dream", "value": -0.11264736305259061}, - {"category": "Torgersen", "value": -0.07065913511418596}, + "numerical_value": [ + pd.NA, + pd.NA, + 0.401489, + -0.377482, + 0.524052, + 0.501174, + pd.NA, ], - [], - [], - [], - [], - [ - {"category": ".", "value": 0.0015916894448071784}, - {"category": "MALE", "value": 0.06869704739750442}, - {"category": "FEMALE", "value": -0.052521171596813174}, - {"category": "_null_filler", "value": -0.0034628622681684906}, + "categorical_value": [ + [ + { + "category": "Gentoo penguin (Pygoscelis papua)", + "value": 0.25068877125667804, + }, + { + "category": "Adelie Penguin (Pygoscelis adeliae)", + "value": -0.20622291900416198, + }, + { + "category": "Chinstrap penguin (Pygoscelis antarctica)", + "value": -0.030161149275185855, + }, + ], + [ + {"category": "Biscoe", "value": 0.19761120114410635}, + {"category": "Dream", "value": -0.11264736305259061}, + {"category": "Torgersen", "value": -0.07065913511418596}, + ], + [], + [], + [], + [], + [ + {"category": ".", "value": 0.0015916894448071784}, + {"category": "MALE", "value": 0.06869704739750442}, + {"category": "FEMALE", "value": -0.052521171596813174}, + {"category": "_null_filler", "value": -0.0034628622681684906}, + ], ], - ], - }, + }, + ) + .sort_values(["principal_component_id", "feature"]) + .reset_index(drop=True) ) pd.testing.assert_frame_equal( result, diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index cc4d2e5801..42fea66cf8 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -57,59 +57,63 @@ def test_pca_components_(penguins_pca_model: decomposition.PCA): # result is too long, only check the first principal component here. result = result.head(7) - expected = pd.DataFrame( - { - "principal_component_id": [0] * 7, - "feature": [ - "species", - "island", - "culmen_length_mm", - "culmen_depth_mm", - "flipper_length_mm", - "body_mass_g", - "sex", - ], - "numerical_value": [ - pd.NA, - pd.NA, - 0.401489, - -0.377482, - 0.524052, - 0.501174, - pd.NA, - ], - "categorical_value": [ - [ - { - "category": "Gentoo penguin (Pygoscelis papua)", - "value": 0.25068877125667804, - }, - { - "category": "Adelie Penguin (Pygoscelis adeliae)", - "value": -0.20622291900416198, - }, - { - "category": "Chinstrap penguin (Pygoscelis antarctica)", - "value": -0.030161149275185855, - }, + expected = ( + pd.DataFrame( + { + "principal_component_id": [0] * 7, + "feature": [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", ], - [ - {"category": "Biscoe", "value": 0.19761120114410635}, - {"category": "Dream", "value": -0.11264736305259061}, - {"category": "Torgersen", "value": -0.07065913511418596}, + "numerical_value": [ + pd.NA, + pd.NA, + 0.401489, + -0.377482, + 0.524052, + 0.501174, + pd.NA, ], - [], - [], - [], - [], - [ - {"category": ".", "value": 0.0015916894448071784}, - {"category": "MALE", "value": 0.06869704739750442}, - {"category": "FEMALE", "value": -0.052521171596813174}, - {"category": "_null_filler", "value": -0.0034628622681684906}, + "categorical_value": [ + [ + { + "category": "Gentoo penguin (Pygoscelis papua)", + "value": 0.25068877125667804, + }, + { + "category": "Adelie Penguin (Pygoscelis adeliae)", + "value": -0.20622291900416198, + }, + { + "category": "Chinstrap penguin (Pygoscelis antarctica)", + "value": -0.030161149275185855, + }, + ], + [ + {"category": "Biscoe", "value": 0.19761120114410635}, + {"category": "Dream", "value": -0.11264736305259061}, + {"category": "Torgersen", "value": -0.07065913511418596}, + ], + [], + [], + [], + [], + [ + {"category": ".", "value": 0.0015916894448071784}, + {"category": "MALE", "value": 0.06869704739750442}, + {"category": "FEMALE", "value": -0.052521171596813174}, + {"category": "_null_filler", "value": -0.0034628622681684906}, + ], ], - ], - }, + }, + ) + .sort_values(["principal_component_id", "feature"]) + .reset_index(drop=True) ) pd.testing.assert_frame_equal( result, diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index cb27dd388c..c8906bc628 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -36,6 +36,12 @@ def test_model_predict(time_series_arima_plus_model): expected["forecast_timestamp"] = expected["forecast_timestamp"].astype( pd.ArrowDtype(pa.timestamp("us", tz="UTC")) ) + + print("pandas") + print(expected.to_string()) + print("bigframes") + print(predictions.to_string()) + pd.testing.assert_frame_equal( predictions, expected, diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index bb84721332..c410d70fe7 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -74,7 +74,7 @@ def test_read_gbq_start_sets_session_location( # Now read_gbq* from another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent, ): read_method(query) @@ -99,7 +99,7 @@ def test_read_gbq_start_sets_session_location( # Now read_gbq* from another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -145,7 +145,7 @@ def test_read_gbq_after_session_start_must_comply_with_default_location( # Doing read_gbq* from a table in another location should fail with pytest.raises( - ValueError, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -193,7 +193,7 @@ def test_read_gbq_must_comply_with_set_location_US( # Starting user journey with read_gbq* from another location should fail with pytest.raises( - ValueError, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent_tokyo, ): read_method(query_tokyo) @@ -243,7 +243,7 @@ def test_read_gbq_must_comply_with_set_location_non_US( # Starting user journey with read_gbq* from another location should fail with pytest.raises( - google.api_core.exceptions.NotFound, + (google.api_core.exceptions.NotFound, ValueError), match=dataset_id_permanent, ): read_method(query) From e614482339bf7e9745e34da84cbab7981e80d0dc Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 02:05:58 +0000 Subject: [PATCH 05/12] more test, mypy fixes --- bigframes/session/__init__.py | 53 +++++---------- tests/system/small/ml/test_core.py | 104 +++++++++++++++-------------- 2 files changed, 70 insertions(+), 87 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index e436e8247b..eac5d75df3 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -36,7 +36,6 @@ Tuple, Union, ) -import uuid import warnings import google.api_core.client_info @@ -505,7 +504,7 @@ def read_gbq_table( api_name="read_gbq_table", ) - def _read_gbq_table_to_ibis_with_total_ordering( + def _get_snapshot_sql_and_primary_key( self, table_ref: bigquery.table.TableReference, *, @@ -518,15 +517,6 @@ def _read_gbq_table_to_ibis_with_total_ordering( column(s), then return those too so that ordering generation can be avoided. """ - if table_ref.dataset_id.upper() == "_SESSION": - # _SESSION tables aren't supported by the tables.get REST API. - return ( - self.ibis_client.sql( - f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`" - ), - None, - ) - table_expression = self.ibis_client.table( table_ref.table_id, database=f"{table_ref.project}.{table_ref.dataset_id}", @@ -551,22 +541,18 @@ def _read_gbq_table_to_ibis_with_total_ordering( .get("columns") ) - if not primary_keys: - return table_expression, None - else: - # Read from a snapshot since we won't have to copy the table data to create a total ordering. - job_config = bigquery.QueryJobConfig() - job_config.labels["bigframes-api"] = api_name - current_timestamp = list( - self.bqclient.query( - "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", - job_config=job_config, - ).result() - )[0][0] - table_expression = self.ibis_client.sql( - bigframes_io.create_snapshot_sql(table_ref, current_timestamp) - ) - return table_expression, primary_keys + job_config = bigquery.QueryJobConfig() + job_config.labels["bigframes-api"] = api_name + current_timestamp = list( + self.bqclient.query( + "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", + job_config=job_config, + ).result() + )[0][0] + table_expression = self.ibis_client.sql( + bigframes_io.create_snapshot_sql(table_ref, current_timestamp) + ) + return table_expression, primary_keys def _read_gbq_table( self, @@ -589,7 +575,7 @@ def _read_gbq_table( ( table_expression, total_ordering_cols, - ) = self._read_gbq_table_to_ibis_with_total_ordering( + ) = self._get_snapshot_sql_and_primary_key( table_ref, api_name=api_name, enforce_region=True ) @@ -843,7 +829,7 @@ def _read_pandas( job_config.clustering_fields = cluster_cols job_config.labels = {"bigframes-api": api_name} - load_table_destination = self._create_session_table() + load_table_destination = bigframes_io.random_table(self._anonymous_dataset) load_job = self.bqclient.load_table_from_dataframe( pandas_dataframe_copy, load_table_destination, @@ -1144,13 +1130,6 @@ def _check_file_size(self, filepath: str): "for large files to avoid loading the file into local memory." ) - def _create_session_table(self) -> bigquery.TableReference: - table_name = f"{uuid.uuid4().hex}" - dataset = bigquery.Dataset( - bigquery.DatasetReference(self.bqclient.project, "_SESSION") - ) - return dataset.table(table_name) - def _create_empty_temp_table( self, schema: Iterable[bigquery.SchemaField], @@ -1457,7 +1436,7 @@ def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn: # Some of these probably don't work col_type = column.type() if col_type.is_array() or col_type.is_struct(): - result = vendored_ibis_ops.ToJsonString(column).to_expr() + result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore elif col_type.is_geospatial(): result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text() elif col_type.is_string(): diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 732ec4bb5e..be34a4871c 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -78,58 +78,62 @@ def test_model_eval_with_data(penguins_bqml_linear_model, penguins_df_default_in def test_model_centroids(penguins_bqml_kmeans_model: core.BqmlModel): result = penguins_bqml_kmeans_model.centroids().to_pandas() - expected = pd.DataFrame( - { - "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], - "feature": [ - "culmen_length_mm", - "culmen_depth_mm", - "flipper_length_mm", - "sex", - ] - * 3, - "numerical_value": [ - 47.509677, - 14.993548, - 217.040123, - pd.NA, - 38.207813, - 18.03125, - 187.992188, - pd.NA, - 47.036346, - 18.834808, - 197.1612, - pd.NA, - ], - "categorical_value": [ - [], - [], - [], - [ - {"category": ".", "value": 0.008064516129032258}, - {"category": "MALE", "value": 0.49193548387096775}, - {"category": "FEMALE", "value": 0.47580645161290325}, - {"category": "_null_filler", "value": 0.024193548387096774}, - ], - [], - [], - [], - [ - {"category": "MALE", "value": 0.34375}, - {"category": "FEMALE", "value": 0.625}, - {"category": "_null_filler", "value": 0.03125}, + expected = ( + pd.DataFrame( + { + "centroid_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], + "feature": [ + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + * 3, + "numerical_value": [ + 47.509677, + 14.993548, + 217.040123, + pd.NA, + 38.207813, + 18.03125, + 187.992188, + pd.NA, + 47.036346, + 18.834808, + 197.1612, + pd.NA, ], - [], - [], - [], - [ - {"category": "MALE", "value": 0.6847826086956522}, - {"category": "FEMALE", "value": 0.2826086956521739}, - {"category": "_null_filler", "value": 0.03260869565217391}, + "categorical_value": [ + [], + [], + [], + [ + {"category": ".", "value": 0.008064516129032258}, + {"category": "MALE", "value": 0.49193548387096775}, + {"category": "FEMALE", "value": 0.47580645161290325}, + {"category": "_null_filler", "value": 0.024193548387096774}, + ], + [], + [], + [], + [ + {"category": "MALE", "value": 0.34375}, + {"category": "FEMALE", "value": 0.625}, + {"category": "_null_filler", "value": 0.03125}, + ], + [], + [], + [], + [ + {"category": "MALE", "value": 0.6847826086956522}, + {"category": "FEMALE", "value": 0.2826086956521739}, + {"category": "_null_filler", "value": 0.03260869565217391}, + ], ], - ], - }, + }, + ) + .sort_values(["centroid_id", "feature"]) + .reset_index(drop=True) ) pd.testing.assert_frame_equal( result, From 945742fa51f4ac694a8e8c1ec49a3e135ccde3bf Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 02:31:15 +0000 Subject: [PATCH 06/12] revert removal of session table from read_pandas --- bigframes/session/__init__.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index eac5d75df3..873bb58f3e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -36,6 +36,7 @@ Tuple, Union, ) +import uuid import warnings import google.api_core.client_info @@ -517,6 +518,14 @@ def _get_snapshot_sql_and_primary_key( column(s), then return those too so that ordering generation can be avoided. """ + if table_ref.dataset_id.upper() == "_SESSION": + # _SESSION tables aren't supported by the tables.get REST API. + return ( + self.ibis_client.sql( + f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`" + ), + None, + ) table_expression = self.ibis_client.table( table_ref.table_id, database=f"{table_ref.project}.{table_ref.dataset_id}", @@ -829,7 +838,7 @@ def _read_pandas( job_config.clustering_fields = cluster_cols job_config.labels = {"bigframes-api": api_name} - load_table_destination = bigframes_io.random_table(self._anonymous_dataset) + load_table_destination = self._create_session_table() load_job = self.bqclient.load_table_from_dataframe( pandas_dataframe_copy, load_table_destination, @@ -1130,6 +1139,13 @@ def _check_file_size(self, filepath: str): "for large files to avoid loading the file into local memory." ) + def _create_session_table(self) -> bigquery.TableReference: + table_name = f"{uuid.uuid4().hex}" + dataset = bigquery.Dataset( + bigquery.DatasetReference(self.bqclient.project, "_SESSION") + ) + return dataset.table(table_name) + def _create_empty_temp_table( self, schema: Iterable[bigquery.SchemaField], From d8e99f6c8918a93d6cf10ada6a7da58156edc222 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 20:05:12 +0000 Subject: [PATCH 07/12] address pr comments --- bigframes/core/__init__.py | 2 +- bigframes/ml/sql.py | 5 +++- bigframes/session/__init__.py | 36 ++++++++++++----------- tests/system/small/ml/test_forecasting.py | 5 ---- tests/system/small/test_dataframe_io.py | 18 ++++++++++++ 5 files changed, 42 insertions(+), 24 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 63f36d4ddd..b640692bc8 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -165,7 +165,7 @@ def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: ibis_expr = compiled_value._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True ) - tmp_table = self.session._ibis_to_session_table( + tmp_table = self.session._ibis_to_temp_table( ibis_expr, cluster_cols=cluster_cols, api_name="cached" ) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 601b271099..2d624d1990 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -132,7 +132,10 @@ def create_model( transforms: Optional[Iterable[str]] = None, ) -> str: """Encode the CREATE TEMP MODEL statement for BQML""" - source_sql = source_df.sql + # TODO: Compile unordered sql here instead of ordered sql + # Cache as underlying table may use snapshot, which is incompatible with model generation + # Cached df creates a non-snapshot copy. + source_sql = source_df._cached().sql transform_sql = self.transform(*transforms) if transforms is not None else None options_sql = self.options(**options) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 873bb58f3e..a3dd181d86 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -510,7 +510,6 @@ def _get_snapshot_sql_and_primary_key( table_ref: bigquery.table.TableReference, *, api_name: str, - enforce_region: bool = False, ) -> Tuple[ibis_types.Table, Optional[Sequence[str]]]: """Create a read-only Ibis table expression representing a table. @@ -536,11 +535,10 @@ def _get_snapshot_sql_and_primary_key( # the same assumption and use these columns as the total ordering keys. table = self.bqclient.get_table(table_ref) - if enforce_region: - if table.location.casefold() != self._location.casefold(): - raise ValueError( - f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" - ) + if table.location.casefold() != self._location.casefold(): + raise ValueError( + f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" + ) # TODO(b/305264153): Use public properties to fetch primary keys once # added to google-cloud-bigquery. @@ -584,9 +582,7 @@ def _read_gbq_table( ( table_expression, total_ordering_cols, - ) = self._get_snapshot_sql_and_primary_key( - table_ref, api_name=api_name, enforce_region=True - ) + ) = self._get_snapshot_sql_and_primary_key(table_ref, api_name=api_name) for key in col_order: if key not in table_expression.columns: @@ -605,6 +601,9 @@ def _read_gbq_table( f"Column `{key}` of `index_col` not found in this table." ) + if col_order: + table_expression = table_expression.select([*index_cols, *col_order]) + # If the index is unique and sortable, then we don't need to generate # an ordering column. ordering = None @@ -658,9 +657,6 @@ def _read_gbq_table( else: array_value = self._create_total_ordering(table_expression) - if col_order: - array_value = array_value.select_columns(tuple(col_order)) - value_columns = [col for col in array_value.column_ids if col not in index_cols] block = blocks.Block( array_value, @@ -1213,7 +1209,7 @@ def _create_total_ordering( ordering=ordering, ) - def _ibis_to_session_table( + def _ibis_to_temp_table( self, table: ibis_types.Table, cluster_cols: Iterable[str], @@ -1449,14 +1445,20 @@ def _can_cluster_bq(field: bigquery.SchemaField): def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn: - # Some of these probably don't work col_type = column.type() - if col_type.is_array() or col_type.is_struct(): - result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore + if ( + col_type.is_numeric() + or col_type.is_boolean() + or col_type.is_binary() + or col_type.is_temporal() + ): + result = column.cast(ibis_dtypes.String(nullable=True)) elif col_type.is_geospatial(): result = typing.cast(ibis_types.GeoSpatialColumn, column).as_text() elif col_type.is_string(): result = column else: - result = column.cast(ibis_dtypes.String(nullable=True)) + # TO_JSON_STRING works with all data types, but isn't the most efficient + # Needed for JSON, STRUCT and ARRAY datatypes + result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore return typing.cast(ibis_types.StringColumn, result) diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index c8906bc628..55079c94cf 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -37,11 +37,6 @@ def test_model_predict(time_series_arima_plus_model): pd.ArrowDtype(pa.timestamp("us", tz="UTC")) ) - print("pandas") - print(expected.to_string()) - print("bigframes") - print(predictions.to_string()) - pd.testing.assert_frame_equal( predictions, expected, diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d700d93be9..fb9fb7bb89 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -80,6 +80,24 @@ def test_to_pandas_array_struct_correct_result(session): ) +def test_load_json(session): + df = session.read_gbq( + """SELECT + JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column + """ + ) + + result = df.to_pandas() + expected = pd.DataFrame( + { + "json_column": ['{"bar":true,"foo":10}'], + } + ) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(result.dtypes, expected.dtypes) + pd.testing.assert_series_equal(result["json_column"], expected["json_column"]) + + def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): """Verify to_pandas_batches() APIs returns the expected dtypes.""" expected = scalars_df_default_index.dtypes From 69b8e28b3d1c58a2ee761cb03806ff88bc770ced Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 10 Nov 2023 12:44:34 -0800 Subject: [PATCH 08/12] Update bigframes/session/__init__.py Co-authored-by: Tim Swast --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index a3dd181d86..5fbbc4cd3c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -608,7 +608,7 @@ def _read_gbq_table( # an ordering column. ordering = None if total_ordering_cols is not None: - # Note: currently, this a table has a total ordering only when the + # Note: currently, a table has a total ordering only when the # primary key(s) are set on a table. The query engine assumes such # columns are unique, even if not enforced. ordering = orderings.ExpressionOrdering( From 85a54f5cfba18eb9c5bace3bfe4d543b8faea16e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 20:54:18 +0000 Subject: [PATCH 09/12] remove outdated comment --- bigframes/session/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2f49e98fa8..9f70fc874a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1186,7 +1186,6 @@ def _create_total_ordering( # Used to disambiguate between identical rows (which will have identical hash) random_value = ibis.random().name(ordering_rand_part) - # Might need to actually reselect here? original_column_ids = table.columns table_with_ordering = table.select( itertools.chain(original_column_ids, [full_row_hash, random_value]) From ef3dd3d4ea94a28003964b9958b0153631d9f38e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 21:23:49 +0000 Subject: [PATCH 10/12] make _cached return self and fix golden sql ml tests --- bigframes/dataframe.py | 3 ++- bigframes/ml/core.py | 6 ++++-- bigframes/ml/sql.py | 5 +---- bigframes/series.py | 3 ++- tests/unit/ml/test_golden_sql.py | 2 ++ 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0a03575491..bdbc00e620 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2719,7 +2719,8 @@ def _get_block(self) -> blocks.Block: return self._block def _cached(self) -> DataFrame: - return DataFrame(self._block.cached()) + self._set_block(self._block.cached()) + return self _DataFrameOrSeries = typing.TypeVar("_DataFrameOrSeries") diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index b9627e5a71..5630b34f2d 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -232,10 +232,12 @@ def create_model( Returns: a BqmlModel, wrapping a trained model in BigQuery """ options = dict(options) + # Cache dataframes to make sure base table is not a snapshot + # cached dataframe creates a full copy, never uses snapshot if y_train is None: - input_data = X_train + input_data = X_train._cached() else: - input_data = X_train.join(y_train, how="outer") + input_data = X_train._cached().join(y_train._cached(), how="outer") options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 2d624d1990..601b271099 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -132,10 +132,7 @@ def create_model( transforms: Optional[Iterable[str]] = None, ) -> str: """Encode the CREATE TEMP MODEL statement for BQML""" - # TODO: Compile unordered sql here instead of ordered sql - # Cache as underlying table may use snapshot, which is incompatible with model generation - # Cached df creates a non-snapshot copy. - source_sql = source_df._cached().sql + source_sql = source_df.sql transform_sql = self.transform(*transforms) if transforms is not None else None options_sql = self.options(**options) diff --git a/bigframes/series.py b/bigframes/series.py index 4fab1fe943..28290d591e 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1503,7 +1503,8 @@ def _slice( ) def _cached(self) -> Series: - return Series(self._block.cached()) + self._set_block(self._block.cached()) + return self def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 3ca7e144a5..700eb500ff 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -38,6 +38,7 @@ def mock_session(): def mock_y(): mock_y = mock.create_autospec(spec=bpd.DataFrame) mock_y.columns = pd.Index(["input_column_label"]) + mock_y._cached.return_value = mock_y return mock_y @@ -57,6 +58,7 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) + mock_X._cached.return_value = mock_X return mock_X From 286578832fd88b8d11e1cb07efb7ced2964d310d Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 22:16:08 +0000 Subject: [PATCH 11/12] cache before creating time series model to avoid snapshot --- bigframes/ml/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 5630b34f2d..39d01fca8d 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -265,7 +265,9 @@ def create_time_series_model( ), "Time stamp data input must only contain 1 column." options = dict(options) - input_data = X_train.join(y_train, how="outer") + # Cache dataframes to make sure base table is not a snapshot + # cached dataframe creates a full copy, never uses snapshot + input_data = X_train._cached().join(y_train._cached(), how="outer") options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) From ee1f3c5c733fb41eccaf666f4a55c4e348d29569 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 Nov 2023 23:31:34 +0000 Subject: [PATCH 12/12] remove doctest lines dependent on unspecified ordering --- bigframes/session/__init__.py | 16 ---------------- third_party/bigframes_vendored/pandas/io/gbq.py | 10 ---------- .../bigframes_vendored/pandas/io/parquet.py | 6 ------ .../bigframes_vendored/pandas/io/pickle.py | 10 ---------- 4 files changed, 42 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9f70fc874a..82c5a1c8d0 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -379,12 +379,6 @@ def read_gbq_query( ... pitchSpeed, ... FROM `bigquery-public-data.baseball.games_wide` ... ''') - >>> df.head(2) - pitcherFirstName pitcherLastName pitchSpeed - 0 0 - 1 0 - - [2 rows x 3 columns] Preserve ordering in a query input. @@ -481,16 +475,6 @@ def read_gbq_table( Read a whole table, with arbitrary ordering or ordering corresponding to the primary key(s). >>> df = bpd.read_gbq_table("bigquery-public-data.ml_datasets.penguins") - >>> df.head(2) - species island culmen_length_mm \\ - 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 - 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 - - culmen_depth_mm flipper_length_mm body_mass_g sex - 0 18.4 184.0 3475.0 FEMALE - 1 19.1 184.0 4650.0 MALE - - [2 rows x 7 columns] See also: :meth:`Session.read_gbq`. """ diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 575c501618..2161310b07 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -45,16 +45,6 @@ def read_gbq( If the input is a table ID: >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") - >>> df.head(2) - species island culmen_length_mm \\ - 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 - 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 - - culmen_depth_mm flipper_length_mm body_mass_g sex - 0 18.4 184.0 3475.0 FEMALE - 1 19.1 184.0 4650.0 MALE - - [2 rows x 7 columns] Preserve ordering in a query input. diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index f97bd386a4..0f664e70fc 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -24,12 +24,6 @@ def read_parquet( >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" >>> df = bpd.read_parquet(path=gcs_path) - >>> df.head(2) - name post_abbr - 0 Alabama AL - 1 Alaska AK - - [2 rows x 2 columns] Args: path (str): diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 053ba4871c..096d9b13d6 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -32,16 +32,6 @@ def read_pickle( >>> gcs_path = "gs://bigframes-dev-testing/test_pickle.pkl" >>> df = bpd.read_pickle(filepath_or_buffer=gcs_path) - >>> df.head(2) - species island culmen_length_mm \\ - 0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 - 1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 - - culmen_depth_mm flipper_length_mm body_mass_g sex - 0 18.4 184.0 3475.0 FEMALE - 1 19.1 184.0 4650.0 MALE - - [2 rows x 7 columns] Args: filepath_or_buffer (str, path object, or file-like object):