From 104611b4bc09fdb9bf1299c66e30f6d38dea22f0 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 23:13:18 +0000 Subject: [PATCH] perf: Make repr cache the block where appropriate --- bigframes/core/blocks.py | 13 ++++++- bigframes/core/nodes.py | 69 ++++++++++++++++++++++++++++++++++- bigframes/core/ordering.py | 8 ++++ bigframes/core/traversal.py | 27 ++++++++++++++ bigframes/dataframe.py | 12 +++++- bigframes/ml/core.py | 10 +++-- bigframes/series.py | 5 ++- bigframes/session/__init__.py | 38 ++++++++++++++++++- 8 files changed, 171 insertions(+), 11 deletions(-) create mode 100644 bigframes/core/traversal.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index aa9649f272..d86875ae62 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1697,10 +1697,19 @@ def to_sql_query( idx_labels, ) - def cached(self) -> Block: + def cached(self, *, optimize_offsets=False, force: bool = False) -> Block: """Write the block to a session table and create a new block object that references it.""" + # use a heuristic for whether something needs to be cached + if (not force) and self.session._is_trivially_executable(self.expr): + return self + if optimize_offsets: + expr = self.session._cache_with_offsets(self.expr) + else: + expr = self.session._cache_with_cluster_cols( + self.expr, cluster_cols=self.index_columns + ) return Block( - self.session._execute_and_cache(self.expr, cluster_cols=self.index_columns), + expr, index_columns=self.index_columns, column_labels=self.column_labels, index_labels=self.index_labels, diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index e1882c3684..99ea229a44 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -50,6 +50,19 @@ def deterministic(self) -> bool: """Whether this node will evaluates deterministically.""" return True + @property + def row_preserving(self) -> bool: + """Whether this node preserves input rows.""" + return True + + @property + def 
non_local(self) -> bool: + """ + Whether this node combines information across multiple rows instead of processing rows independently. + Used as an approximation for whether the expression may require shuffling to execute (and therefore be expensive). + """ + return False + @property def child_nodes(self) -> typing.Sequence[BigFrameNode]: """Direct children of this node""" @@ -104,6 +117,14 @@ class JoinNode(BigFrameNode): join: JoinDefinition allow_row_identity_join: bool = True + @property + def row_preserving(self) -> bool: + return False + + @property + def non_local(self) -> bool: + return True + @property def child_nodes(self) -> typing.Sequence[BigFrameNode]: return (self.left_child, self.right_child) @@ -184,11 +205,19 @@ def __hash__(self): def peekable(self) -> bool: return False + @property + def non_local(self) -> bool: + return False + @dataclass(frozen=True) class FilterNode(UnaryNode): predicate: ex.Expression + @property + def row_preserving(self) -> bool: + return False + def __hash__(self): return self._node_hash @@ -221,7 +250,13 @@ def __hash__(self): # TODO: Merge RowCount and Corr into Aggregate Node @dataclass(frozen=True) class RowCountNode(UnaryNode): - pass + @property + def row_preserving(self) -> bool: + return False + + @property + def non_local(self) -> bool: + return True @dataclass(frozen=True) @@ -230,6 +265,10 @@ class AggregateNode(UnaryNode): by_column_ids: typing.Tuple[str, ...] 
= tuple([]) dropna: bool = True + @property + def row_preserving(self) -> bool: + return False + def __hash__(self): return self._node_hash @@ -237,6 +276,10 @@ def __hash__(self): def peekable(self) -> bool: return False + @property + def non_local(self) -> bool: + return True + # TODO: Unify into aggregate @dataclass(frozen=True) @@ -246,10 +289,18 @@ class CorrNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def row_preserving(self) -> bool: + return False + @property def peekable(self) -> bool: return False + @property + def non_local(self) -> bool: + return True + @dataclass(frozen=True) class WindowOpNode(UnaryNode): @@ -267,6 +318,10 @@ def __hash__(self): def peekable(self) -> bool: return False + @property + def non_local(self) -> bool: + return True + @dataclass(frozen=True) class ReprojectOpNode(UnaryNode): @@ -290,6 +345,14 @@ class UnpivotNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def row_preserving(self) -> bool: + return False + + @property + def non_local(self) -> bool: + return True + @property def peekable(self) -> bool: return False @@ -303,5 +366,9 @@ class RandomSampleNode(UnaryNode): def deterministic(self) -> bool: return False + @property + def row_preserving(self) -> bool: + return False + def __hash__(self): return self._node_hash diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index 3ab89e0213..1fd5ab4e37 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -92,6 +92,14 @@ class ExpressionOrdering: # Therefore, any modifications(or drops) done to these columns must result in hidden copies being made. 
total_ordering_columns: frozenset[str] = field(default_factory=frozenset) + @classmethod + def from_offset_col(cls, col: str) -> ExpressionOrdering: + return ExpressionOrdering( + (OrderingColumnReference(col),), + integer_encoding=IntegerEncoding(True, is_sequential=True), + total_ordering_columns=frozenset({col}), + ) + def with_non_sequential(self): """Create a copy that is marked as non-sequential. diff --git a/bigframes/core/traversal.py b/bigframes/core/traversal.py new file mode 100644 index 0000000000..b038ee6599 --- /dev/null +++ b/bigframes/core/traversal.py @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import bigframes.core.nodes as nodes + + +def is_trivially_executable(node: nodes.BigFrameNode) -> bool: + if local_only(node): + return True + if node.non_local or not node.row_preserving: + return False + return all(is_trivially_executable(child) for child in node.child_nodes) + + +def local_only(node: nodes.BigFrameNode) -> bool: + return all(isinstance(root, nodes.ReadLocalNode) for root in node.roots) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9989831e1b..86417afbce 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -587,6 +587,8 @@ def __repr__(self) -> str: max_results = opts.max_rows if opts.repr_mode == "deferred": return formatter.repr_query_job(self.query_job) + + self._cached() # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? @@ -624,6 +626,8 @@ def _repr_html_(self) -> str: max_results = bigframes.options.display.max_rows if opts.repr_mode == "deferred": return formatter.repr_query_job_html(self.query_job) + + self._cached() # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? @@ -3089,8 +3093,12 @@ def _set_block(self, block: blocks.Block): def _get_block(self) -> blocks.Block: return self._block - def _cached(self) -> DataFrame: - self._set_block(self._block.cached()) + def _cached(self, *, force: bool = False) -> DataFrame: + """Materialize dataframe to a temporary table. + No-op if the dataframe represents a trivial transformation of an existing materialization. + Force=True is used for the BQML integration, which needs a full copy of the data rather than a snapshot. 
+ """ + self._set_block(self._block.cached(force=force)) return self _DataFrameOrSeries = typing.TypeVar("_DataFrameOrSeries") diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 7c156b4cb7..266ab1b058 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -247,9 +247,11 @@ def create_model( # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot if y_train is None: - input_data = X_train._cached() + input_data = X_train._cached(force=True) else: - input_data = X_train._cached().join(y_train._cached(), how="outer") + input_data = X_train._cached(force=True).join( + y_train._cached(force=True), how="outer" + ) options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -281,7 +283,9 @@ def create_time_series_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train._cached().join(y_train._cached(), how="outer") + input_data = X_train._cached(force=True).join( + y_train._cached(force=True), how="outer" + ) options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) diff --git a/bigframes/series.py b/bigframes/series.py index e049b41461..e328be4936 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -259,6 +259,7 @@ def __repr__(self) -> str: if opts.repr_mode == "deferred": return formatter.repr_query_job(self.query_job) + self._cached(force=False) pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) self._set_internal_query_job(query_job) @@ -1523,8 +1524,8 @@ def _slice( ), ) - def _cached(self) -> Series: - self._set_block(self._block.cached()) + def _cached(self, *, force: bool = True) -> Series: + self._set_block(self._block.cached(force=force)) return self diff --git a/bigframes/session/__init__.py 
b/bigframes/session/__init__.py index da39ab43ce..0c5162ef10 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -71,6 +71,7 @@ import bigframes.core.guid as guid from bigframes.core.ordering import IntegerEncoding, OrderingColumnReference import bigframes.core.ordering as orderings +import bigframes.core.traversal as traversals import bigframes.core.utils as utils import bigframes.dataframe as dataframe import bigframes.formatting_helpers as formatting_helpers @@ -1462,7 +1463,7 @@ def _start_query( results_iterator = query_job.result(max_results=max_results) return results_iterator, query_job - def _execute_and_cache( + def _cache_with_cluster_cols( self, array_value: core.ArrayValue, cluster_cols: typing.Sequence[str] ) -> core.ArrayValue: """Executes the query and uses the resulting table to rewrite future executions.""" @@ -1493,6 +1494,41 @@ def _execute_and_cache( ordering=compiled_value._ordering, ) + def _cache_with_offsets(self, array_value: core.ArrayValue) -> core.ArrayValue: + """Executes the query and uses the resulting table to rewrite future executions.""" + # TODO: Use this for all executions? 
Problem is that caching materializes extra + # ordering columns + compiled_value = self._compile_ordered(array_value) + + ibis_expr = compiled_value._to_ibis_expr( + ordering_mode="offset_col", order_col_name="bigframes_offsets" + ) + tmp_table = self._ibis_to_temp_table( + ibis_expr, cluster_cols=["bigframes_offsets"], api_name="cached" + ) + table_expression = self.ibis_client.table( + f"{tmp_table.project}.{tmp_table.dataset_id}.{tmp_table.table_id}" + ) + new_columns = [table_expression[column] for column in compiled_value.column_ids] + new_hidden_columns = [table_expression["bigframes_offsets"]] + # TODO: Instead, keep session-wide map of cached results and automatically reuse + return core.ArrayValue.from_ibis( + self, + table_expression, + columns=new_columns, + hidden_ordering_columns=new_hidden_columns, + ordering=orderings.ExpressionOrdering.from_offset_col("bigframes_offsets"), + ) + + def _is_trivially_executable(self, array_value: core.ArrayValue) -> bool: + """ + Can the block be evaluated very cheaply? + If True, the array_value probably is not worth caching. + """ + # Once rewriting is available, will want to rewrite before + # evaluating execution cost. + return traversals.is_trivially_executable(array_value.node) + def _execute( self, array_value: core.ArrayValue,