From 155ca7d41091aff49bfde7bfa39d3ec858cac4a7 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 23:13:18 +0000 Subject: [PATCH 1/5] feat: add efficient peek dataframe preview --- bigframes/core/__init__.py | 4 +- bigframes/core/blocks.py | 10 +++++ bigframes/core/compile/__init__.py | 6 +-- bigframes/core/compile/compiled.py | 7 ++++ bigframes/core/compile/compiler.py | 38 ++++++++++--------- bigframes/core/nodes.py | 55 ++++++++++++++++++++++++++++ bigframes/dataframe.py | 8 ++++ bigframes/session/__init__.py | 15 +++++++- tests/system/small/test_dataframe.py | 22 +++++++++++ 9 files changed, 141 insertions(+), 24 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index e32977fbce..77432605fc 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -104,10 +104,10 @@ def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: return self._compile_ordered().get_column_type(key) def _compile_ordered(self) -> compiling.OrderedIR: - return compiling.compile_ordered(self.node) + return compiling.compile_ordered_ir(self.node) def _compile_unordered(self) -> compiling.UnorderedIR: - return compiling.compile_unordered(self.node) + return compiling.compile_unordered_ir(self.node) def row_count(self) -> ArrayValue: """Get number of rows in ArrayValue as a single-entry ArrayValue.""" diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 1960def0d5..369874164f 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -432,8 +432,18 @@ def to_pandas( downsampling=sampling, ordered=ordered ) ) + df.set_axis(self.column_labels, axis=1, copy=False) return df, query_job + def try_peek(self, n: int = 20) -> typing.Optional[pd.DataFrame]: + if self.expr.node.peekable: + iterator, _ = self.session._peek(self.expr, n) + df = self._to_dataframe(iterator) + self._copy_index_to_pandas(df) + return df + else: + return None + def to_pandas_batches(self): """Download results one message 
at a time.""" dtypes = dict(zip(self.index_columns, self.index_dtypes)) diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index 761fd9a465..c3e2bd832a 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -13,11 +13,11 @@ # limitations under the License. from bigframes.core.compile.compiled import OrderedIR, UnorderedIR -from bigframes.core.compile.compiler import compile_ordered, compile_unordered +from bigframes.core.compile.compiler import compile_ordered_ir, compile_unordered_ir __all__ = [ - "compile_ordered", - "compile_unordered", + "compile_ordered_ir", + "compile_unordered_ir", "OrderedIR", "UnorderedIR", ] diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 6a3db3f2bd..410d4b9fcd 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -233,6 +233,13 @@ def builder(self): predicates=self._predicates, ) + def peek_sql(self, n: int): + # Peek currently implemented as top level LIMIT op. + # Execution engine handles limit pushdown. + # In future, may push down limit/filters in compilation. 
+ sql = ibis_bigquery.Backend().compile(self._to_ibis_expr().limit(n)) + return typing.cast(str, sql) + def to_sql( self, offset_column: typing.Optional[str] = None, diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index eacee26bcb..3d0300cae0 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -29,14 +29,18 @@ import bigframes.session -def compile_ordered(node: nodes.BigFrameNode) -> compiled.OrderedIR: +def compile_ordered_ir(node: nodes.BigFrameNode) -> compiled.OrderedIR: return typing.cast(compiled.OrderedIR, compile_node(node, True)) -def compile_unordered(node: nodes.BigFrameNode) -> compiled.UnorderedIR: +def compile_unordered_ir(node: nodes.BigFrameNode) -> compiled.UnorderedIR: return typing.cast(compiled.UnorderedIR, compile_node(node, False)) +def compile_peak_sql(node: nodes.BigFrameNode, n_rows: int) -> typing.Optional[str]: + return compile_unordered_ir(node).peek_sql(n_rows) + + @functools.cache def compile_node( node: nodes.BigFrameNode, ordered: bool = True @@ -56,8 +60,8 @@ def _compile_node( @_compile_node.register def compile_join(node: nodes.JoinNode, ordered: bool = True): if ordered: - left_ordered = compile_ordered(node.left_child) - right_ordered = compile_ordered(node.right_child) + left_ordered = compile_ordered_ir(node.left_child) + right_ordered = compile_ordered_ir(node.right_child) return bigframes.core.compile.single_column.join_by_column_ordered( left_ordered, node.left_column_ids, @@ -67,8 +71,8 @@ def compile_join(node: nodes.JoinNode, ordered: bool = True): allow_row_identity_join=node.allow_row_identity_join, ) else: - left_unordered = compile_unordered(node.left_child) - right_unordered = compile_unordered(node.right_child) + left_unordered = compile_unordered_ir(node.left_child) + right_unordered = compile_unordered_ir(node.right_child) return bigframes.core.compile.single_column.join_by_column_unordered( left_unordered, node.left_column_ids, @@ 
-117,7 +121,7 @@ def compile_readgbq(node: nodes.ReadGbqNode, ordered: bool = True): @_compile_node.register def compile_promote_offsets(node: nodes.PromoteOffsetsNode, ordered: bool = True): - result = compile_ordered(node.child).promote_offsets(node.col_id) + result = compile_ordered_ir(node.child).promote_offsets(node.col_id) return result if ordered else result.to_unordered() @@ -129,17 +133,17 @@ def compile_filter(node: nodes.FilterNode, ordered: bool = True): @_compile_node.register def compile_orderby(node: nodes.OrderByNode, ordered: bool = True): if ordered: - return compile_ordered(node.child).order_by(node.by) + return compile_ordered_ir(node.child).order_by(node.by) else: - return compile_unordered(node.child) + return compile_unordered_ir(node.child) @_compile_node.register def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): if ordered: - return compile_ordered(node.child).reversed() + return compile_ordered_ir(node.child).reversed() else: - return compile_unordered(node.child) + return compile_unordered_ir(node.child) @_compile_node.register @@ -153,22 +157,22 @@ def compile_projection(node: nodes.ProjectionNode, ordered: bool = True): @_compile_node.register def compile_concat(node: nodes.ConcatNode, ordered: bool = True): if ordered: - compiled_ordered = [compile_ordered(node) for node in node.children] + compiled_ordered = [compile_ordered_ir(node) for node in node.children] return concat_impl.concat_ordered(compiled_ordered) else: - compiled_unordered = [compile_unordered(node) for node in node.children] + compiled_unordered = [compile_unordered_ir(node) for node in node.children] return concat_impl.concat_unordered(compiled_unordered) @_compile_node.register def compile_rowcount(node: nodes.RowCountNode, ordered: bool = True): - result = compile_unordered(node.child).row_count() + result = compile_unordered_ir(node.child).row_count() return result if ordered else result.to_unordered() @_compile_node.register def 
compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): - result = compile_unordered(node.child).aggregate( + result = compile_unordered_ir(node.child).aggregate( node.aggregations, node.by_column_ids, node.dropna ) return result if ordered else result.to_unordered() @@ -176,13 +180,13 @@ def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): @_compile_node.register def compile_corr(node: nodes.CorrNode, ordered: bool = True): - result = compile_unordered(node.child).corr_aggregate(node.corr_aggregations) + result = compile_unordered_ir(node.child).corr_aggregate(node.corr_aggregations) return result if ordered else result.to_unordered() @_compile_node.register def compile_window(node: nodes.WindowOpNode, ordered: bool = True): - result = compile_ordered(node.child).project_window_op( + result = compile_ordered_ir(node.child).project_window_op( node.column_name, node.op, node.window_spec, diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 360f853e3e..50648ff1dc 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -16,6 +16,7 @@ from dataclasses import dataclass, field, fields import functools +import itertools import typing from typing import Tuple @@ -73,6 +74,18 @@ def session(self): def _node_hash(self): return hash(tuple(hash(getattr(self, field.name)) for field in fields(self))) + @property + def peekable(self) -> bool: + """Indicates whether the node can be sampled efficiently""" + return all(child.peekable for child in self.child_nodes) + + @property + def roots(self) -> typing.Set[BigFrameNode]: + roots = itertools.chain.from_iterable( + map(lambda child: child.roots, self.child_nodes) + ) + return set(roots) + @dataclass(frozen=True) class UnaryNode(BigFrameNode): @@ -105,6 +118,12 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + children_peekable = all(child.peekable for child in self.child_nodes) 
+ single_root = len(self.roots) == 1 + return children_peekable and single_root + @dataclass(frozen=True) class ConcatNode(BigFrameNode): @@ -126,6 +145,14 @@ class ReadLocalNode(BigFrameNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return True + + @property + def roots(self) -> typing.Set[BigFrameNode]: + return {self} + # TODO: Refactor to take raw gbq object reference @dataclass(frozen=True) @@ -143,6 +170,14 @@ def session(self): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return True + + @property + def roots(self) -> typing.Set[BigFrameNode]: + return {self} + # Unary nodes @dataclass(frozen=True) @@ -160,6 +195,10 @@ class PromoteOffsetsNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + @dataclass(frozen=True) class FilterNode(UnaryNode): @@ -218,6 +257,10 @@ class AggregateNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + # TODO: Unify into aggregate @dataclass(frozen=True) @@ -227,6 +270,10 @@ class CorrNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + @dataclass(frozen=True) class WindowOpNode(UnaryNode): @@ -240,6 +287,10 @@ class WindowOpNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + @dataclass(frozen=True) class ReprojectOpNode(UnaryNode): @@ -263,6 +314,10 @@ class UnpivotNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + @dataclass(frozen=True) class AssignNode(UnaryNode): diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1288117395..7b6e10bc60 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1066,6 +1066,14 @@ def head(self, n: int = 5) -> DataFrame: def tail(self, n: int = 5) -> 
DataFrame: return typing.cast(DataFrame, self.iloc[-n:]) + def peek(self, n: int = 5) -> pandas.DataFrame: + maybe_result = self._block.try_peek(n) + if maybe_result is None: + raise NotImplementedError( + "Cannot peek efficiently when data has aggregates, joins or window functions applied." + ) + return maybe_result.set_axis(self._block.column_labels, axis=1, copy=False) + def nlargest( self, n: int, diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index d503b844aa..10b4e9efa7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1508,6 +1508,17 @@ def _execute( job_config=job_config, ) + def _peek( + self, array_value: core.ArrayValue, n_rows: int + ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + """A 'peek' efficiently accesses a small number of rows in the dataframe.""" + if not array_value.node.peekable: + raise NotImplementedError("cannot efficient peek this dataframe") + sql = self._compile_unordered(array_value).peek_sql(n_rows) + return self._start_query( + sql=sql, + ) + def _to_sql( self, array_value: core.ArrayValue, @@ -1528,12 +1539,12 @@ def _to_sql( def _compile_ordered( self, array_value: core.ArrayValue ) -> bigframes.core.compile.OrderedIR: - return bigframes.core.compile.compile_ordered(array_value.node) + return bigframes.core.compile.compile_ordered_ir(array_value.node) def _compile_unordered( self, array_value: core.ArrayValue ) -> bigframes.core.compile.UnorderedIR: - return bigframes.core.compile.compile_unordered(array_value.node) + return bigframes.core.compile.compile_unordered_ir(array_value.node) def _get_table_size(self, destination_table): table = self.bqclient.get_table(destination_table) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9557475b46..a9a4fa6c89 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -412,6 +412,28 @@ def test_rename(scalars_dfs): ) +def 
test_df_peek(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df.peek(n=3) + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[scalars_df.int64_col != 0].peek(n=3) + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_exception(scalars_dfs): + scalars_df, _ = scalars_dfs + + with pytest.raises(NotImplementedError): + # Window ops aren't compatible with efficient peeking + scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3) + + def test_repr_w_all_rows(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs From 8403f41f9e04ce7ab25c906d31e86942c4151b1b Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 17 Jan 2024 20:40:18 +0000 Subject: [PATCH 2/5] add force parameter to peek to cache full dataframe --- bigframes/dataframe.py | 13 +++++++++---- tests/system/small/test_dataframe.py | 11 ++++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7b6e10bc60..ac90422e86 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1066,12 +1066,17 @@ def head(self, n: int = 5) -> DataFrame: def tail(self, n: int = 5) -> DataFrame: return typing.cast(DataFrame, self.iloc[-n:]) - def peek(self, n: int = 5) -> pandas.DataFrame: + def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: maybe_result = self._block.try_peek(n) if maybe_result is None: - raise NotImplementedError( - "Cannot peek efficiently when data has aggregates, joins or window functions applied." 
- ) + if force: + self._cached() + maybe_result = self._block.try_peek(n) + assert maybe_result is not None + else: + raise NotImplementedError( + "Cannot peek efficiently when data has aggregates, joins or window functions applied. Use force=True to fully compute dataframe." + ) return maybe_result.set_axis(self._block.column_labels, axis=1, copy=False) def nlargest( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index a9a4fa6c89..28126a5134 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -431,7 +431,16 @@ def test_df_peek_exception(scalars_dfs): with pytest.raises(NotImplementedError): # Window ops aren't compatible with efficient peeking - scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3) + scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3, force=False) + + +def test_df_peek_force(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3, force=True) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 def test_repr_w_all_rows(scalars_dfs): From 7218102ec4693303a7037fc75b2cf7167af300b7 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 17 Jan 2024 20:53:57 +0000 Subject: [PATCH 3/5] add df.peek docstring --- bigframes/dataframe.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ac90422e86..d833b9c5e0 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1067,6 +1067,23 @@ def tail(self, n: int = 5) -> DataFrame: return typing.cast(DataFrame, self.iloc[-n:]) def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: + """ + Preview n arbitrary rows from the dataframe. No guarantees about row selection or ordering. 
+ DataFrame.peek(force=False) is much faster than DataFrame.peek, but will only succeed in the + absence of joins, aggregations, and analytic operators. + + Args: + n (int, default 5): + The number of rows to select from the dataframe. Which N rows are returned is non-deterministic. + force (bool, default True): + If the data cannot be peeked efficiently, the dataframe will instead be fully materialized as part + of the operation if force=True. I force=False, the operation will throw a NotImplementedError. + Returns: + pandas.DataFrame: A pandas DataFrame with n rows. + + Raises: + NotImplementedError: If force=False and data cannot be efficiently peeked. + """ maybe_result = self._block.try_peek(n) if maybe_result is None: if force: From 272f98bdebbc25452307ef1a9c17f655ac384313 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 24 Jan 2024 23:41:52 +0000 Subject: [PATCH 4/5] set peek to default force=False --- bigframes/dataframe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d833b9c5e0..6377e7af21 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1066,7 +1066,7 @@ def head(self, n: int = 5) -> DataFrame: def tail(self, n: int = 5) -> DataFrame: return typing.cast(DataFrame, self.iloc[-n:]) - def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: + def peek(self, n: int = 5, *, force: bool = False) -> pandas.DataFrame: """ Preview n arbitrary rows from the dataframe. No guarantees about row selection or ordering. DataFrame.peek(force=False) is much faster than DataFrame.peek, but will only succeed in the @@ -1075,9 +1075,9 @@ def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: Args: n (int, default 5): The number of rows to select from the dataframe. Which N rows are returned is non-deterministic. 
-            force (bool, default True):
+            force (bool, default False):
                 If the data cannot be peeked efficiently, the dataframe will instead be fully materialized as part
-                of the operation if force=True. I force=False, the operation will throw a NotImplementedError.
+                of the operation if force=True. If force=False, the operation will throw a NotImplementedError.
         Returns:
             pandas.DataFrame: A pandas DataFrame with n rows.

From 625fe68475670dc09f6604a350e00df3f0e358a5 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Thu, 25 Jan 2024 18:06:55 +0000
Subject: [PATCH 5/5] update peek docstring and error type

---
 bigframes/dataframe.py               | 11 ++++++-----
 tests/system/small/test_dataframe.py |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 6377e7af21..9989831e1b 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1069,20 +1069,21 @@ def tail(self, n: int = 5) -> DataFrame:
     def peek(self, n: int = 5, *, force: bool = False) -> pandas.DataFrame:
         """
         Preview n arbitrary rows from the dataframe. No guarantees about row selection or ordering.
-        DataFrame.peek(force=False) is much faster than DataFrame.peek, but will only succeed in the
-        absence of joins, aggregations, and analytic operators.
+        DataFrame.peek(force=False) will always be very fast, but will not succeed if data requires
+        full data scanning. Using force=True will always succeed, but may perform expensive
+        computations.

         Args:
             n (int, default 5):
                 The number of rows to select from the dataframe. Which N rows are returned is non-deterministic.
             force (bool, default False):
                 If the data cannot be peeked efficiently, the dataframe will instead be fully materialized as part
-                of the operation if force=True. If force=False, the operation will throw a NotImplementedError.
+                of the operation if force=True. If force=False, the operation will throw a ValueError.
         Returns:
             pandas.DataFrame: A pandas DataFrame with n rows.
Raises: - NotImplementedError: If force=False and data cannot be efficiently peeked. + ValueError: If force=False and data cannot be efficiently peeked. """ maybe_result = self._block.try_peek(n) if maybe_result is None: @@ -1091,7 +1092,7 @@ def peek(self, n: int = 5, *, force: bool = False) -> pandas.DataFrame: maybe_result = self._block.try_peek(n) assert maybe_result is not None else: - raise NotImplementedError( + raise ValueError( "Cannot peek efficiently when data has aggregates, joins or window functions applied. Use force=True to fully compute dataframe." ) return maybe_result.set_axis(self._block.column_labels, axis=1, copy=False) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 28126a5134..4ae31fa4a0 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -429,7 +429,7 @@ def test_df_peek_filtered(scalars_dfs): def test_df_peek_exception(scalars_dfs): scalars_df, _ = scalars_dfs - with pytest.raises(NotImplementedError): + with pytest.raises(ValueError): # Window ops aren't compatible with efficient peeking scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3, force=False)