From 498b5cd16d3159affb2e40c17bb1fb5e26ab696d Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 27 Oct 2023 23:10:00 +0000 Subject: [PATCH 01/11] feat: add unordered sql compilation --- bigframes/core/__init__.py | 43 +- bigframes/core/blocks.py | 9 +- bigframes/core/compile/__init__.py | 4 +- bigframes/core/compile/compiled.py | 1906 ++++++++++++------- bigframes/core/compile/compiler.py | 164 +- bigframes/core/compile/concat.py | 100 + bigframes/core/compile/row_identity.py | 70 +- bigframes/core/compile/single_column.py | 103 +- bigframes/dataframe.py | 6 + bigframes/session/__init__.py | 6 +- tests/system/large/ml/test_cluster.py | 4 +- tests/system/large/ml/test_pipeline.py | 4 +- tests/system/large/test_remote_function.py | 28 +- tests/system/small/ml/test_cluster.py | 4 +- tests/system/small/ml/test_core.py | 2 +- tests/system/small/ml/test_decomposition.py | 4 +- tests/system/small/test_dataframe.py | 50 +- tests/system/small/test_dataframe_io.py | 13 +- tests/system/small/test_multiindex.py | 6 +- tests/system/small/test_pandas.py | 8 +- tests/system/small/test_remote_function.py | 26 +- tests/system/small/test_series.py | 10 +- tests/system/utils.py | 25 +- tests/unit/test_core.py | 20 +- 24 files changed, 1706 insertions(+), 909 deletions(-) create mode 100644 bigframes/core/compile/concat.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 4653f0ab6a..7f2e231edb 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -23,7 +23,8 @@ import ibis.expr.types as ibis_types import pandas -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled +import bigframes.core.compile.compiler as compiler import bigframes.core.guid import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference @@ -77,7 +78,7 @@ def from_pandas(cls, pd_df: pandas.DataFrame): @property def column_ids(self) -> typing.Sequence[str]: - return self.compile().column_ids + return self._compile().column_ids @property def session(self) -> Session: @@ -87,15 +88,18 @@ def session(self) -> Session: return self.node.session[0] if required_session else get_global_session() def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - return self.compile().get_column_type(key) + return self._compile().get_column_type(key) - def compile(self) -> compiled.CompiledArrayValue: - return compiled.compile_node(self.node) + def _compile(self) -> compiled.OrderedIR: + return compiler.compile_ordered(self.node) + + def _compile_unordered(self) -> compiled.UnorderedIR: + return compiler.compile_unordered(self.node) def shape(self) -> typing.Tuple[int, int]: """Returns dimensions as (length, width) tuple.""" - width = len(self.compile().columns) - count_expr = self.compile()._to_ibis_expr("unordered").count() + width = len(self._compile().columns) + count_expr = self._compile()._to_ibis_expr(ordering_mode="unordered").count() # Support in-memory engines for hermetic unit tests. 
if not self.node.session: @@ -120,11 +124,14 @@ def to_sql( col_id_overrides: typing.Mapping[str, str] = {}, sorted: bool = False, ) -> str: - return self.compile().to_sql( - offset_column=offset_column, - col_id_overrides=col_id_overrides, - sorted=sorted, - ) + if sorted or offset_column: + return self._compile().to_sql( + offset_column=offset_column, + col_id_overrides=col_id_overrides, + sorted=sorted, + ) + else: + return self._compile_unordered().to_sql(col_id_overrides=col_id_overrides) def start_query( self, @@ -153,8 +160,10 @@ def start_query( def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: """Write the ArrayValue to a session table and create a new block object that references it.""" - compiled = self.compile() - ibis_expr = compiled._to_ibis_expr("unordered", expose_hidden_cols=True) + compiled = self._compile() + ibis_expr = compiled._to_ibis_expr( + ordering_mode="unordered", expose_hidden_cols=True + ) destination = self.session._ibis_to_session_table( ibis_expr, cluster_cols=cluster_cols, api_name="cache" ) @@ -210,12 +219,6 @@ def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: nodes.SelectNode(child=self.node, column_ids=tuple(column_ids)) ) - def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: - """Append together multiple ArrayValue objects.""" - return ArrayValue( - nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]])) - ) - def project_unary_op( self, column_name: str, op: ops.UnaryOp, output_name=None ) -> ArrayValue: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cc13edeaf9..589fed4199 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -386,6 +386,8 @@ def to_pandas( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" if max_download_size is None: @@ -412,6 +414,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, + ordered=ordered, ) return df, query_job @@ -446,12 +449,16 @@ def _compute_and_count( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. expr = self._apply_value_keys_to_expr(value_keys=value_keys) - results_iterator, query_job = expr.start_query(max_results=max_results) + results_iterator, query_job = expr.start_query( + max_results=max_results, sorted=ordered + ) table_size = ( expr.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index c86f4463dc..af3f32aefb 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -13,9 +13,9 @@ # limitations under the License. 
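For illustration, the dispatch in the new `ArrayValue.to_sql` above plays out as follows. This is a minimal sketch, not part of the patch, and it assumes an in-memory expression built via the `from_pandas` classmethod shown above; `ArrayValue` is internal API and subject to change:

    import pandas
    from bigframes.core import ArrayValue

    value = ArrayValue.from_pandas(pandas.DataFrame({"a": [1, 2, 3]}))
    sql_unordered = value.to_sql()           # no offsets/sort requested: _compile_unordered()
    sql_ordered = value.to_sql(sorted=True)  # ordering required: _compile() -> OrderedIR

The unordered path is the new default, so plain `to_sql()` no longer has to carry hidden ordering columns through the generated SQL.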
 from bigframes.core.compile.compiled import CompiledArrayValue
-from bigframes.core.compile.compiler import compile_node
+from bigframes.core.compile.compiler import compile_ordered
 
 __all__ = [
-    "compile_node",
+    "compile_ordered",
     "CompiledArrayValue",
 ]
diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py
index 1134f1aab0..0feecae5b6 100644
--- a/bigframes/core/compile/compiled.py
+++ b/bigframes/core/compile/compiled.py
@@ -14,7 +14,6 @@
 from __future__ import annotations
 
 import functools
-import math
 import textwrap
 import typing
 from typing import Collection, Iterable, Literal, Optional, Sequence
@@ -32,8 +31,6 @@
     ExpressionOrdering,
     IntegerEncoding,
     OrderingColumnReference,
-    reencode_order_string,
-    StringEncoding,
 )
 import bigframes.core.utils as utils
 from bigframes.core.window_spec import WindowSpec
@@ -45,76 +42,250 @@
 PREDICATE_COLUMN = "bigframes_predicate"
 
 
-class CompiledArrayValue:
-    """Immutable BigQuery DataFrames expression tree.
+class CompiledArrayValue(typing.Protocol):
+    @property
+    def column_ids(self) -> typing.Sequence[str]:
+        ...
 
-    Note: Usage of this class is considered to be private and subject to change
-    at any time.
+    def to_sql(self) -> str:
+        ...
 
-    This class is a wrapper around Ibis expressions. Its purpose is to defer
-    Ibis projection operations to keep generated SQL small and correct when
-    mixing and matching columns from different versions of a DataFrame.
+    def _to_ibis_expr(self, *args, **kwargs) -> ibis_types.Table:
+        """Exposed for testing purposes only."""
+        ...
 
-    Args:
-        table: An Ibis table expression.
-        columns: Ibis value expressions that can be projected as columns.
-        hidden_ordering_columns: Ibis value expressions to store ordering.
-        ordering: An ordering property of the data frame.
-        predicates: A list of filters on the data frame.
-    """
+    def select_columns(self, column_ids: typing.Sequence[str]) -> CompiledArrayValue:
+        ...
+
+    def drop_columns(self, columns: Iterable[str]) -> CompiledArrayValue:
+        return self.select_columns(
+            [col for col in self.column_ids if col not in columns]
+        )
+
+    def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
+        ...
+
+    def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue:
+        """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression."""
+        ...
+
+    def order_by(
+        self, by: Sequence[OrderingColumnReference], stable: bool = False
+    ) -> CompiledArrayValue:
+        ...
+
+    def reversed(self) -> CompiledArrayValue:
+        ...
+
+    def project_unary_op(
+        self, column_name: str, op: ops.UnaryOp, output_name=None
+    ) -> CompiledArrayValue:
+        """Creates a new expression based on this expression with unary operation applied to one column."""
+        ...
+
+    def project_binary_op(
+        self,
+        left_column_id: str,
+        right_column_id: str,
+        op: ops.BinaryOp,
+        output_column_id: str,
+    ) -> CompiledArrayValue:
+        """Creates a new expression based on this expression with binary operation applied to two columns."""
+        ...
+
+    def project_ternary_op(
+        self,
+        col_id_1: str,
+        col_id_2: str,
+        col_id_3: str,
+        op: ops.TernaryOp,
+        output_column_id: str,
+    ) -> CompiledArrayValue:
+        """Creates a new expression based on this expression with ternary operation applied to three columns."""
+        ...
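Because `CompiledArrayValue` is now a `typing.Protocol`, downstream code can be typed structurally against it rather than against a concrete IR class. A rough sketch of the idea; `render_sql` is a hypothetical helper, not part of this patch:

    def render_sql(ir: CompiledArrayValue) -> str:
        # Both OrderedIR and UnorderedIR satisfy the protocol structurally;
        # ordering-only operations simply raise on the unordered implementation.
        return ir.to_sql()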
+
+    def aggregate(
+        self,
+        aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]],
+        by_column_ids: typing.Sequence[str] = (),
+        dropna: bool = True,
+    ) -> CompiledArrayValue:
+        """
+        Apply aggregations to the expression.
+        Arguments:
+            aggregations: input_column_id, operation, output_column_id tuples
+            by_column_ids: column ids of the aggregation key; these are preserved through the transform
+            dropna: whether null keys should be dropped
+        """
+        ...
+
+    def corr_aggregate(
+        self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]]
+    ) -> CompiledArrayValue:
+        """
+        Get correlations between each left_column_id and right_column_id, stored in the respective output_column_id.
+        This uses BigQuery's CORR under the hood, and thus only Pearson's method is used.
+        Arguments:
+            corr_aggregations: left_column_id, right_column_id, output_column_id tuples
+        """
+        ...
+
+    def assign(self, source_id: str, destination_id: str) -> CompiledArrayValue:
+        ...
+
+    def assign_constant(
+        self,
+        destination_id: str,
+        value: typing.Any,
+        dtype: typing.Optional[bigframes.dtypes.Dtype],
+    ) -> CompiledArrayValue:
+        ...
+
+    def unpivot(
+        self,
+        row_labels: typing.Sequence[typing.Hashable],
+        unpivot_columns: typing.Sequence[
+            typing.Tuple[str, typing.Sequence[typing.Optional[str]]]
+        ],
+        *,
+        passthrough_columns: typing.Sequence[str] = (),
+        index_col_ids: typing.Sequence[str] = ["index"],
+        dtype: typing.Union[
+            bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype]
+        ] = pandas.Float64Dtype(),
+        how="left",
+    ) -> CompiledArrayValue:
+        """
+        Unpivot ArrayValue columns.
+
+        Args:
+            row_labels: Identifies the source of the row. Must be equal in length to the source column lists in the unpivot_columns argument.
+            unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None.
+            passthrough_columns: Columns that will not be unpivoted. Column id will be preserved.
+            index_col_ids (list of str): The column ids to be used for the row labels.
+            dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns.
+
+        Returns:
+            ArrayValue: The unpivoted ArrayValue
+        """
+        ...
+
+    def _reproject_to_table(self) -> CompiledArrayValue:
+        """
+        Internal operator that projects the internal representation into a
+        new ibis table expression where each value column is a direct
+        reference to a column in that table expression. Needed after
+        some operations such as window operations that cannot be used
+        recursively in projections.
+        """
+        ...
+
+    def _uniform_sampling(self, fraction: float) -> CompiledArrayValue:
+        """Sample the table at the given fraction.
+
+        .. warning::
+            The row numbers of the result are non-deterministic; avoid relying on them.
+        """
+        ...
+
+    # Always ordered operations
+    def project_window_op(
+        self,
+        column_name: str,
+        op: agg_ops.WindowOp,
+        window_spec: WindowSpec,
+        output_name=None,
+        *,
+        never_skip_nulls=False,
+        skip_reproject_unsafe: bool = False,
+    ) -> OrderedIR:
+        """
+        Creates a new expression based on this expression with a window operation applied to one column.
+ column_name: the id of the input column present in the expression + op: the windowable operator to apply to the input column + window_spec: a specification of the window over which to apply the operator + output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided + never_skip_nulls: will disable null skipping for operators that would otherwise do so + skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection + """ + ... + + def promote_offsets(self, col_id: str): + """ + Convenience function to promote copy of column offsets to a value column. Can be used to reset index. + """ + ... + + +class BaseIbisIR: + """Implementation detail, contains common logic between ordered and unordered IR""" def __init__( self, table: ibis_types.Table, columns: Sequence[ibis_types.Value], - hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, - ordering: ExpressionOrdering = ExpressionOrdering(), predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): self._table = table self._predicates = tuple(predicates) if predicates is not None else () - # TODO: Validate ordering - if not ordering.total_ordering_columns: - raise ValueError("Must have total ordering defined by one or more columns") - self._ordering = ordering # Allow creating a DataFrame directly from an Ibis table expression. # TODO(swast): Validate that each column references the same table (or # no table for literal values). self._columns = tuple(columns) - - # Meta columns store ordering, or other data that doesn't correspond to dataframe columns - self._hidden_ordering_columns = ( - tuple(hidden_ordering_columns) - if hidden_ordering_columns is not None - else () - ) - # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. 
self._column_names = {column.get_name(): column for column in self._columns} - self._hidden_ordering_column_names = { - column.get_name(): column for column in self._hidden_ordering_columns - } - ### Validation - value_col_ids = self._column_names.keys() - hidden_col_ids = self._hidden_ordering_column_names.keys() - all_columns = value_col_ids | hidden_col_ids - ordering_valid = all( - col.column_id in all_columns for col in ordering.all_ordering_columns + @property + def columns(self) -> typing.Tuple[ibis_types.Value, ...]: + return self._columns + + @property + def column_ids(self) -> typing.Sequence[str]: + return tuple(self._column_names.keys()) + + @property + def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: + """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" + return ( + _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) + if self._predicates + else None ) - if value_col_ids & hidden_col_ids: + + def _get_ibis_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column.""" + if key not in self.column_ids: raise ValueError( - f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" + "Column name {} not in set of values: {}".format(key, self.column_ids) ) - if not ordering_valid: - raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") + return typing.cast(ibis_types.Value, self._column_names[key]) + + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + ibis_type = typing.cast( + bigframes.dtypes.IbisDtype, self._get_ibis_column(key).type() + ) + return typing.cast( + bigframes.dtypes.Dtype, + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), + ) + + +# Ibis Implementations +class UnorderedIR(BaseIbisIR, CompiledArrayValue): + def __init__( + self, + table: ibis_types.Table, + columns: Sequence[ibis_types.Value], + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + super().__init__(table, columns, predicates) @classmethod - def mem_expr_from_pandas( + def from_pandas( cls, pd_df: pandas.DataFrame, - ) -> CompiledArrayValue: + ) -> UnorderedIR: """ Builds an in-memory only (SQL only) expr from a pandas dataframe. """ @@ -123,7 +294,6 @@ def mem_expr_from_pandas( column_names = [str(column) for column in pd_df.columns] # Make sure column names are all strings. pd_df = pd_df.set_axis(column_names, axis="columns") - pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) # ibis memtable cannot handle NA, must convert to None pd_df = pd_df.astype("object") # type: ignore @@ -134,21 +304,18 @@ def mem_expr_from_pandas( schema = keys_memtable.schema() new_schema = [] for column_index, column in enumerate(schema): - if column == ORDER_ID_COLUMN: - new_type: ibis_dtypes.DataType = ibis_dtypes.int64 - else: - column_type = schema[column] - # The autodetected type might not be one we can support, such - # as NULL type for empty rows, so convert to a type we do - # support. - new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) - ) - # TODO(swast): Ibis memtable doesn't use backticks in struct - # field names, so spaces and other characters aren't allowed in - # the memtable context. 
Blocked by - # https://github.com/ibis-project/ibis/issues/7187 - column = f"col_{column_index}" + column_type = schema[column] + # The autodetected type might not be one we can support, such + # as NULL type for empty rows, so convert to a type we do + # support. + new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) + ) + # TODO(swast): Ibis memtable doesn't use backticks in struct + # field names, so spaces and other characters aren't allowed in + # the memtable context. Blocked by + # https://github.com/ibis-project/ibis/issues/7187 + column = f"col_{column_index}" new_schema.append((column, new_type)) # must set non-null column labels. these are not the user-facing labels @@ -164,112 +331,108 @@ def mem_expr_from_pandas( keys_memtable[f"col_{column_index}"].name(column) for column_index, column in enumerate(column_names) ], - ordering=ExpressionOrdering( - ordering_value_columns=tuple( - [OrderingColumnReference(ORDER_ID_COLUMN)] - ), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - ), - hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), ) - @property - def columns(self) -> typing.Tuple[ibis_types.Value, ...]: - return self._columns + def builder(self): + """Creates a mutable builder for expressions.""" + # Since ArrayValue is intended to be immutable (immutability offers + # potential opportunities for caching, though we might need to introduce + # more node types for that to be useful), we create a builder class. + return UnorderedIR.Builder( + self._table, + columns=self._columns, + predicates=self._predicates, + ) - @property - def column_ids(self) -> typing.Sequence[str]: - return tuple(self._column_names.keys()) - - @property - def _hidden_column_ids(self) -> typing.Sequence[str]: - return tuple(self._hidden_ordering_column_names.keys()) - - @property - def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: - """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" - return ( - _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) - if self._predicates - else None + def to_sql( + self, + offset_column: typing.Optional[str] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, + ) -> str: + if offset_column or sorted: + raise ValueError("Cannot produce sorted sql in unordered mode") + sql = ibis_bigquery.Backend().compile( + self._to_ibis_expr( + col_id_overrides=col_id_overrides, + ) ) + return typing.cast(str, sql) - @property - def _ibis_order(self) -> Sequence[ibis_types.Value]: - """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" - return _convert_ordering_to_table_values( - {**self._column_names, **self._hidden_ordering_column_names}, - self._ordering.all_ordering_columns, - ) + def _to_ibis_expr( + self, + *, + expose_hidden_cols: bool = False, + fraction: Optional[float] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + **kwargs, + ): + """ + Creates an Ibis table expression representing the DataFrame. - def builder(self) -> ArrayValueBuilder: - """Creates a mutable builder for expressions.""" - # Since ArrayValue is intended to be immutable (immutability offers - # potential opportunities for caching, though we might need to introduce - # more node types for that to be useful), we create a builder class. 
-        return ArrayValueBuilder(
-            self._table,
-            columns=self._columns,
-            hidden_ordering_columns=self._hidden_ordering_columns,
-            ordering=self._ordering,
-            predicates=self._predicates,
-        )
+        This IR is unordered, so the output contains no ordering information;
+        only value columns are projected.

-    def drop_columns(self, columns: Iterable[str]) -> CompiledArrayValue:
-        # Must generate offsets if we are dropping a column that ordering depends on
-        expr = self
-        for ordering_column in set(columns).intersection(
-            [col.column_id for col in self._ordering.ordering_value_columns]
-        ):
-            expr = self._hide_column(ordering_column)
-
-        expr_builder = expr.builder()
-        remain_cols = [
-            column for column in expr.columns if column.get_name() not in columns
-        ]
-        expr_builder.columns = remain_cols
-        return expr_builder.build()
-
-    def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
-        ibis_type = typing.cast(
-            bigframes.dtypes.IbisDtype, self._get_any_column(key).type()
-        )
-        return typing.cast(
-            bigframes.dtypes.Dtype,
-            bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type),
-        )
+        Args:
+            expose_hidden_cols:
+                If True, include hidden columns (such as the reduced
+                predicate) in the results instead of dropping them.
+            col_id_overrides:
+                overrides the column ids for the result
+        Returns:
+            An ibis expression representing the data held by the ArrayValue object.
+        """
+        columns = list(self._columns)
+        columns_to_drop: list[
+            str
+        ] = []  # Ordering/Filtering columns that will be dropped at end

-    def _get_ibis_column(self, key: str) -> ibis_types.Value:
-        """Gets the Ibis expression for a given column."""
-        if key not in self.column_ids:
-            raise ValueError(
-                "Column name {} not in set of values: {}".format(key, self.column_ids)
-            )
-        return typing.cast(ibis_types.Value, self._column_names[key])
+        if self._reduced_predicate is not None:
+            columns.append(self._reduced_predicate)
+            # Usually drop the predicate, as it will be all TRUE after filtering
+            if not expose_hidden_cols:
+                columns_to_drop.append(self._reduced_predicate.get_name())

-    def _get_any_column(self, key: str) -> ibis_types.Value:
-        """Gets the Ibis expression for a given column. Will also get hidden columns."""
-        all_columns = {**self._column_names, **self._hidden_ordering_column_names}
-        if key not in all_columns.keys():
-            raise ValueError(
-                "Column name {} not in set of values: {}".format(
-                    key, all_columns.keys()
-                )
-            )
-        return typing.cast(ibis_types.Value, all_columns[key])
+        # Special case for empty tables, since we can't create an empty
+        # projection.
+ if not columns: + return ibis.memtable([]) - def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: - """Gets the Ibis expression for a given hidden column.""" - if key not in self._hidden_ordering_column_names.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, self._hidden_ordering_column_names.keys() - ) - ) - return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) + # Make sure all dtypes are the "canonical" ones for BigFrames. This is + # important for operations like UNION where the schema must match. + table = self._table.select( + bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + ) + base_table = table + if self._reduced_predicate is not None: + table = table.filter(base_table[PREDICATE_COLUMN]) + table = table.drop(*columns_to_drop) + if col_id_overrides: + table = table.relabel(col_id_overrides) + if fraction is not None: + table = table.filter(ibis.random() < ibis.literal(fraction)) + return table + + def select_columns(self, column_ids: typing.Sequence[str]) -> UnorderedIR: + """Creates a new expression based on this expression with new columns.""" + columns = [self._get_ibis_column(col_id) for col_id in column_ids] + builder = self.builder() + builder.columns = list(columns) + new_expr = builder.build() + return new_expr def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" condition = typing.cast( ibis_types.BooleanValue, self._get_ibis_column(predicate_id) ) @@ -285,172 +448,20 @@ def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayVal def _filter(self, predicate_value: ibis_types.BooleanValue) -> CompiledArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" expr = self.builder() - expr.ordering = expr.ordering.with_non_sequential() expr.predicates = [*self._predicates, predicate_value] return expr.build() def order_by( self, by: Sequence[OrderingColumnReference], stable: bool = False - ) -> CompiledArrayValue: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) - return expr_builder.build() - - def reversed(self) -> CompiledArrayValue: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_reverse() - return expr_builder.build() - - def _uniform_sampling(self, fraction: float) -> CompiledArrayValue: - """Sampling the table on given fraction. - - .. warning:: - The row numbers of result is non-deterministic, avoid to use. - """ - table = self._to_ibis_expr( - "unordered", expose_hidden_cols=True, fraction=fraction - ) - columns = [table[column_name] for column_name in self._column_names] - hidden_ordering_columns = [ - table[column_name] for column_name in self._hidden_ordering_column_names - ] - return CompiledArrayValue( - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, - ) - - @property - def _offsets(self) -> ibis_types.IntegerColumn: - if not self._ordering.is_sequential: - raise ValueError( - "Expression does not have offsets. Generate them first using project_offsets." - ) - if not self._ordering.total_order_col: - raise ValueError( - "Ordering is invalid. Marked as sequential but no total order columns." 
- ) - column = self._get_any_column(self._ordering.total_order_col.column_id) - return typing.cast(ibis_types.IntegerColumn, column) - - def _project_offsets(self) -> CompiledArrayValue: - """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. Has no effect on expression semantics.""" - if self._ordering.is_sequential: - return self - # TODO(tbergeron): Enforce total ordering - table = self._to_ibis_expr( - ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN - ) - columns = [table[column_name] for column_name in self._column_names] - ordering = ExpressionOrdering( - ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(True, is_sequential=True), - ) - return CompiledArrayValue( - table, - columns=columns, - hidden_ordering_columns=[table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def _hide_column(self, column_id) -> CompiledArrayValue: - """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" - expr_builder = self.builder() - # Need to rename column as caller might be creating a new row with the same name but different values. - # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. - new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") - expr_builder.hidden_ordering_columns = [ - *self._hidden_ordering_columns, - self._get_ibis_column(column_id).name(new_name), - ] - expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) - return expr_builder.build() - - def promote_offsets(self, col_id: str) -> CompiledArrayValue: - """ - Convenience function to promote copy of column offsets to a value column. Can be used to reset index. - """ - # Special case: offsets already exist - ordering = self._ordering - - if (not ordering.is_sequential) or (not ordering.total_order_col): - return self._project_offsets().promote_offsets(col_id) - expr_builder = self.builder() - expr_builder.columns = [ - self._get_any_column(ordering.total_order_col.column_id).name(col_id), - *self.columns, - ] - return expr_builder.build() - - def select_columns(self, column_ids: typing.Sequence[str]) -> CompiledArrayValue: - """Creates a new expression based on this expression with new columns.""" - columns = [self._get_ibis_column(col_id) for col_id in column_ids] - expr = self - for ordering_column in set(self.column_ids).intersection( - [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] - ): - # Need to hide ordering columns that are being dropped. 
Alternatively, could project offsets - expr = expr._hide_column(ordering_column) - builder = expr.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr + ) -> UnorderedIR: + return self - def concat(self, other: typing.Sequence[CompiledArrayValue]) -> CompiledArrayValue: - """Append together multiple ArrayValue objects.""" - if len(other) == 0: - return self - tables = [] - prefix_base = 10 - prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) - # Must normalize all ids to the same encoding size - max_encoding_size = max( - self._ordering.string_encoding.length, - *[expression._ordering.string_encoding.length for expression in other], - ) - for i, expr in enumerate([self, *other]): - ordering_prefix = str(i).zfill(prefix_size) - table = expr._to_ibis_expr( - ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN - ) - # Rename the value columns based on horizontal offset before applying union. - table = table.select( - [ - table[col].name(f"column_{i}") - if col != ORDER_ID_COLUMN - else ( - ordering_prefix - + reencode_order_string( - table[ORDER_ID_COLUMN], max_encoding_size - ) - ).name(ORDER_ID_COLUMN) - for i, col in enumerate(table.columns) - ] - ) - tables.append(table) - combined_table = ibis.union(*tables) - ordering = ExpressionOrdering( - ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - string_encoding=StringEncoding(True, prefix_size + max_encoding_size), - ) - return CompiledArrayValue( - combined_table, - columns=[ - combined_table[col] - for col in combined_table.columns - if col != ORDER_ID_COLUMN - ], - hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], - ordering=ordering, - ) + def reversed(self) -> UnorderedIR: + return self def project_unary_op( self, column_name: str, op: ops.UnaryOp, output_name=None - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with unary operation applied to one column.""" + ) -> UnorderedIR: value = op._as_ibis(self._get_ibis_column(column_name)).name( output_name or column_name ) @@ -462,8 +473,7 @@ def project_binary_op( right_column_id: str, op: ops.BinaryOp, output_column_id: str, - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with binary operation applied to two columns.""" + ) -> UnorderedIR: value = op( self._get_ibis_column(left_column_id), self._get_ibis_column(right_column_id), @@ -477,8 +487,7 @@ def project_ternary_op( col_id_3: str, op: ops.TernaryOp, output_column_id: str, - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with ternary operation applied to three columns.""" + ) -> UnorderedIR: value = op( self._get_ibis_column(col_id_1), self._get_ibis_column(col_id_2), @@ -486,20 +495,140 @@ def project_ternary_op( ).name(output_column_id) return self._set_or_replace_by_id(output_column_id, value) + def assign(self, source_id: str, destination_id: str) -> UnorderedIR: + return self._set_or_replace_by_id( + destination_id, self._get_ibis_column(source_id) + ) + + def assign_constant( + self, + destination_id: str, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> UnorderedIR: + # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. + ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) + if ibis_value is None: + raise NotImplementedError( + f"Type not supported as scalar value {type(value)}. 
{constants.FEEDBACK_LINK}" + ) + expr = self._set_or_replace_by_id(destination_id, ibis_value) + return expr._reproject_to_table() + + def unpivot( + self, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> UnorderedIR: + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr() + row_n = len(row_labels) + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types + ] + + label_columns = [] + for label_part, (col_id, label_dtype) in enumerate( + zip(index_col_ids, labels_dtypes) + ): + # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels + labels_as_tuples = [ + label if isinstance(label, tuple) else (label,) for label in row_labels + ] + cases = [ + ( + i, + bigframes.dtypes.literal_to_ibis_scalar( + label_tuple[label_part], # type:ignore + force_dtype=label_dtype, # type:ignore + ), + ) + for i, label_tuple in enumerate(labels_as_tuples) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(col_id) + ) + label_columns.append(labels_value) + + unpivot_values = [] + for j in range(len(unpivot_columns)): + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + result_col, source_cols = unpivot_columns[j] + null_value = bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ) + ibis_values = [ + ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + if col is not None + else null_value + for col in source_cols + ] + cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] + unpivot_value = typing.cast( + ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] + ).cases( + cases, default=null_value # type:ignore + ) + unpivot_values.append(unpivot_value.name(result_col)) + + unpivot_table = unpivot_table.select( + passthrough_columns, + *label_columns, + *unpivot_values, + unpivot_offset_id, + ) + + value_columns = [ + unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns + ] + passthrough_values = [unpivot_table[col] for col in passthrough_columns] + return UnorderedIR( + table=unpivot_table, + columns=[ + *[unpivot_table[col_id] for col_id in index_col_ids], + *value_columns, + *passthrough_values, + ], + ) + def aggregate( self, aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], by_column_ids: typing.Sequence[str] = (), dropna: bool = True, - ) -> 
CompiledArrayValue:
-        """
-        Apply aggregations to the expression.
-        Arguments:
-            aggregations: input_column_id, operation, output_column_id tuples
-            by_column_id: column id of the aggregation key, this is preserved through the transform
-            dropna: whether null keys should be dropped
-        """
-        table = self._to_ibis_expr("unordered")
+    ) -> OrderedIR:
+        table = self._to_ibis_expr()
         stats = {
             col_out: agg_op._as_ibis(table[col_in])
             for col_in, agg_op, col_out in aggregations
@@ -514,117 +643,621 @@ def aggregate(
                     for column_id in by_column_ids
                 ]
             ),
-            total_ordering_columns=frozenset(by_column_ids),
+                total_ordering_columns=frozenset(by_column_ids),
+            )
+            columns = tuple(result[key] for key in result.columns)
+            expr = OrderedIR(result, columns=columns, ordering=ordering)
+            if dropna:
+                for column_id in by_column_ids:
+                    expr = expr._filter(
+                        ops.notnull_op._as_ibis(expr._get_ibis_column(column_id))
+                    )
+            # Can likely be removed, as the ordering id is redundant: by_column is unique after aggregation
+            return expr._project_offsets()
+        else:
+            aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)}
+            result = table.aggregate(**aggregates)
+            # Ordering is irrelevant for single-row output, but set an ordering id regardless, as other ops (join etc.) expect it.
+            ordering = ExpressionOrdering(
+                ordering_value_columns=tuple(
+                    [OrderingColumnReference(ORDER_ID_COLUMN)]
+                ),
+                total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
+                integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
+            )
+            return OrderedIR(
+                result,
+                columns=[result[col_id] for col_id in [*stats.keys()]],
+                hidden_ordering_columns=[result[ORDER_ID_COLUMN]],
+                ordering=ordering,
+            )
+
+    def corr_aggregate(
+        self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]]
+    ) -> OrderedIR:
+        table = self._to_ibis_expr()
+        stats = {
+            col_out: table[col_left].corr(table[col_right], how="pop")
+            for col_left, col_right, col_out in corr_aggregations
+        }
+        aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)}
+        result = table.aggregate(**aggregates)
+        # Ordering is irrelevant for single-row output, but set an ordering id regardless, as other ops (join etc.) expect it.
+        ordering = ExpressionOrdering(
+            ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]),
+            total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
+            integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
+        )
+        return OrderedIR(
+            result,
+            columns=[result[col_id] for col_id in [*stats.keys()]],
+            hidden_ordering_columns=[result[ORDER_ID_COLUMN]],
+            ordering=ordering,
+        )
+
+    def _uniform_sampling(self, fraction: float) -> UnorderedIR:
+        """Sample the table at the given fraction.
+
+        .. warning::
+            The row numbers of the result are non-deterministic; avoid relying on them.
+        """
+        table = self._to_ibis_expr(fraction=fraction)
+        columns = [table[column_name] for column_name in self._column_names]
+        return UnorderedIR(
+            table,
+            columns=columns,
+        )
+
+    # Unsupported operations, need ordering
+    def project_window_op(
+        self,
+        column_name: str,
+        op: agg_ops.WindowOp,
+        window_spec: WindowSpec,
+        output_name=None,
+        *,
+        never_skip_nulls=False,
+        skip_reproject_unsafe: bool = False,
+    ) -> OrderedIR:
+        raise ValueError("Window ops must be compiled in ordered mode")
+
+    def promote_offsets(self, col_id: str):
+        raise ValueError("Offsets must be compiled in ordered mode")
+
+    ## Helpers
+    def _set_or_replace_by_id(
+        self, id: str, new_value: ibis_types.Value
+    ) -> UnorderedIR:
+        """Safely assign by id, preserving the existing column order."""
+        builder = self.builder()
+        if id in self.column_ids:
+            builder.columns = [
+                val if (col_id != id) else new_value.name(id)
+                for col_id, val in zip(self.column_ids, self._columns)
+            ]
+        else:
+            builder.columns = [*self.columns, new_value.name(id)]
+        return builder.build()
+
+    def _reproject_to_table(self) -> UnorderedIR:
+        """
+        Internal operator that projects the internal representation into a
+        new ibis table expression where each value column is a direct
+        reference to a column in that table expression. Needed after
+        some operations such as window operations that cannot be used
+        recursively in projections.
+        """
+        table = self._to_ibis_expr()
+        columns = [table[column_name] for column_name in self._column_names]
+        return UnorderedIR(
+            table,
+            columns=columns,
+        )
+
+    class Builder:
+        def __init__(
+            self,
+            table: ibis_types.Table,
+            columns: Collection[ibis_types.Value] = (),
+            predicates: Optional[Collection[ibis_types.BooleanValue]] = None,
+        ):
+            self.table = table
+            self.columns = list(columns)
+            self.predicates = list(predicates) if predicates is not None else None
+
+        def build(self) -> UnorderedIR:
+            return UnorderedIR(
+                table=self.table,
+                columns=self.columns,
+                predicates=self.predicates,
+            )
+
+
+class OrderedIR(BaseIbisIR, CompiledArrayValue):
+    """Immutable BigQuery DataFrames expression tree.
+
+    Note: Usage of this class is considered to be private and subject to change
+    at any time.
+
+    This class is a wrapper around Ibis expressions. Its purpose is to defer
+    Ibis projection operations to keep generated SQL small and correct when
+    mixing and matching columns from different versions of a DataFrame.
+
+    Args:
+        table: An Ibis table expression.
+        columns: Ibis value expressions that can be projected as columns.
+        hidden_ordering_columns: Ibis value expressions to store ordering.
+        ordering: An ordering property of the data frame.
+        predicates: A list of filters on the data frame.
+ """ + + def __init__( + self, + table: ibis_types.Table, + columns: Sequence[ibis_types.Value], + hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, + ordering: ExpressionOrdering = ExpressionOrdering(), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + super().__init__(table, columns, predicates) + # TODO: Validate ordering + if not ordering.total_ordering_columns: + raise ValueError("Must have total ordering defined by one or more columns") + self._ordering = ordering + # Meta columns store ordering, or other data that doesn't correspond to dataframe columns + self._hidden_ordering_columns = ( + tuple(hidden_ordering_columns) + if hidden_ordering_columns is not None + else () + ) + + # To allow for more efficient lookup by column name, create a + # dictionary mapping names to column values. + self._column_names = {column.get_name(): column for column in self._columns} + self._hidden_ordering_column_names = { + column.get_name(): column for column in self._hidden_ordering_columns + } + ### Validation + value_col_ids = self._column_names.keys() + hidden_col_ids = self._hidden_ordering_column_names.keys() + + all_columns = value_col_ids | hidden_col_ids + ordering_valid = all( + col.column_id in all_columns for col in ordering.all_ordering_columns + ) + if value_col_ids & hidden_col_ids: + raise ValueError( + f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" + ) + if not ordering_valid: + raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") + + @classmethod + def from_pandas( + cls, + pd_df: pandas.DataFrame, + ) -> OrderedIR: + """ + Builds an in-memory only (SQL only) expr from a pandas dataframe. + """ + # We can't include any hidden columns in the ArrayValue constructor, so + # grab the column names before we add the hidden ordering column. + column_names = [str(column) for column in pd_df.columns] + # Make sure column names are all strings. + pd_df = pd_df.set_axis(column_names, axis="columns") + pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) + + # ibis memtable cannot handle NA, must convert to None + pd_df = pd_df.astype("object") # type: ignore + pd_df = pd_df.where(pandas.notnull(pd_df), None) + + # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases. + keys_memtable = ibis.memtable(pd_df) + schema = keys_memtable.schema() + new_schema = [] + for column_index, column in enumerate(schema): + if column == ORDER_ID_COLUMN: + new_type: ibis_dtypes.DataType = ibis_dtypes.int64 + else: + column_type = schema[column] + # The autodetected type might not be one we can support, such + # as NULL type for empty rows, so convert to a type we do + # support. + new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) + ) + # TODO(swast): Ibis memtable doesn't use backticks in struct + # field names, so spaces and other characters aren't allowed in + # the memtable context. Blocked by + # https://github.com/ibis-project/ibis/issues/7187 + column = f"col_{column_index}" + new_schema.append((column, new_type)) + + # must set non-null column labels. 
these are not the user-facing labels + pd_df = pd_df.set_axis( + [column for column, _ in new_schema], + axis="columns", + ) + keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema)) + + return cls( + keys_memtable, + columns=[ + keys_memtable[f"col_{column_index}"].name(column) + for column_index, column in enumerate(column_names) + ], + ordering=ExpressionOrdering( + ordering_value_columns=tuple( + [OrderingColumnReference(ORDER_ID_COLUMN)] + ), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + ), + hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), + ) + + @property + def _hidden_column_ids(self) -> typing.Sequence[str]: + return tuple(self._hidden_ordering_column_names.keys()) + + @property + def _ibis_order(self) -> Sequence[ibis_types.Value]: + """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" + return _convert_ordering_to_table_values( + {**self._column_names, **self._hidden_ordering_column_names}, + self._ordering.all_ordering_columns, + ) + + def to_unordered(self) -> UnorderedIR: + return UnorderedIR(self._table, self._columns, self._predicates) + + def builder(self) -> OrderedIR.Builder: + """Creates a mutable builder for expressions.""" + # Since ArrayValue is intended to be immutable (immutability offers + # potential opportunities for caching, though we might need to introduce + # more node types for that to be useful), we create a builder class. + return OrderedIR.Builder( + self._table, + columns=self._columns, + hidden_ordering_columns=self._hidden_ordering_columns, + ordering=self._ordering, + predicates=self._predicates, + ) + + def order_by( + self, by: Sequence[OrderingColumnReference], stable: bool = False + ) -> OrderedIR: + expr_builder = self.builder() + expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) + return expr_builder.build() + + def reversed(self) -> OrderedIR: + expr_builder = self.builder() + expr_builder.ordering = self._ordering.with_reverse() + return expr_builder.build() + + def project_unary_op( + self, column_name: str, op: ops.UnaryOp, output_name=None + ) -> OrderedIR: + value = op._as_ibis(self._get_ibis_column(column_name)).name( + output_name or column_name + ) + return self._set_or_replace_by_id(output_name or column_name, value) + + def project_binary_op( + self, + left_column_id: str, + right_column_id: str, + op: ops.BinaryOp, + output_column_id: str, + ) -> OrderedIR: + value = op( + self._get_ibis_column(left_column_id), + self._get_ibis_column(right_column_id), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def project_ternary_op( + self, + col_id_1: str, + col_id_2: str, + col_id_3: str, + op: ops.TernaryOp, + output_column_id: str, + ) -> OrderedIR: + value = op( + self._get_ibis_column(col_id_1), + self._get_ibis_column(col_id_2), + self._get_ibis_column(col_id_3), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def assign(self, source_id: str, destination_id: str) -> OrderedIR: + return self._set_or_replace_by_id( + destination_id, self._get_ibis_column(source_id) + ) + + def assign_constant( + self, + destination_id: str, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> OrderedIR: + # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. 
+
+        ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype)
+        if ibis_value is None:
+            raise NotImplementedError(
+                f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}"
+            )
+        expr = self._set_or_replace_by_id(destination_id, ibis_value)
+        return expr._reproject_to_table()
+
+    def _uniform_sampling(self, fraction: float) -> OrderedIR:
+        """Sample the table at the given fraction.
+
+        .. warning::
+            The row numbers of the result are non-deterministic; avoid relying on them.
+        """
+        table = self._to_ibis_expr(
+            ordering_mode="unordered", expose_hidden_cols=True, fraction=fraction
+        )
+        columns = [table[column_name] for column_name in self._column_names]
+        hidden_ordering_columns = [
+            table[column_name] for column_name in self._hidden_ordering_column_names
+        ]
+        return OrderedIR(
+            table,
+            columns=columns,
+            hidden_ordering_columns=hidden_ordering_columns,
+            ordering=self._ordering,
+        )
+
+    def promote_offsets(self, col_id: str) -> OrderedIR:
+        # Special case: offsets already exist
+        ordering = self._ordering
+
+        if (not ordering.is_sequential) or (not ordering.total_order_col):
+            return self._project_offsets().promote_offsets(col_id)
+        expr_builder = self.builder()
+        expr_builder.columns = [
+            self._get_any_column(ordering.total_order_col.column_id).name(col_id),
+            *self.columns,
+        ]
+        return expr_builder.build()
+
+    def select_columns(self, column_ids: typing.Sequence[str]) -> OrderedIR:
+        """Creates a new expression based on this expression with new columns."""
+        columns = [self._get_ibis_column(col_id) for col_id in column_ids]
+        expr = self
+        for ordering_column in set(self.column_ids).intersection(
+            [col_ref.column_id for col_ref in self._ordering.ordering_value_columns]
+        ):
+            # Need to hide ordering columns that are being dropped.
Alternatively, could project offsets + expr = expr._hide_column(ordering_column) + builder = expr.builder() + builder.columns = list(columns) + new_expr = builder.build() + return new_expr + + def aggregate( + self, + aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], + by_column_ids: typing.Sequence[str] = (), + dropna: bool = True, + ) -> OrderedIR: + return self.to_unordered().aggregate(aggregations, by_column_ids, dropna) + + def corr_aggregate( + self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] + ) -> OrderedIR: + return self.to_unordered().corr_aggregate(corr_aggregations) + + ## Methods that only work with ordering + def project_window_op( + self, + column_name: str, + op: agg_ops.WindowOp, + window_spec: WindowSpec, + output_name=None, + *, + never_skip_nulls=False, + skip_reproject_unsafe: bool = False, + ) -> OrderedIR: + column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) + window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) + + window_op = op._as_ibis(column, window) + + clauses = [] + if op.skips_nulls and not never_skip_nulls: + clauses.append((column.isnull(), ibis.NA)) + if window_spec.min_periods: + if op.skips_nulls: + # Most operations do not count NULL values towards min_periods + observation_count = agg_ops.count_op._as_ibis(column, window) + else: + # Operations like count treat even NULLs as valid observations for the sake of min_periods + # notnull is just used to convert null values to non-null (FALSE) values to be counted + denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) + observation_count = agg_ops.count_op._as_ibis(denulled_value, window) + clauses.append( + ( + observation_count < ibis_types.literal(window_spec.min_periods), + ibis.NA, + ) + ) + if clauses: + case_statement = ibis.case() + for clause in clauses: + case_statement = case_statement.when(clause[0], clause[1]) + case_statement = case_statement.else_(window_op).end() + window_op = case_statement + + result = self._set_or_replace_by_id(output_name or column_name, window_op) + # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. 
+ return result._reproject_to_table() if not skip_reproject_unsafe else result + + def unpivot( + self, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> OrderedIR: + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) + row_n = len(row_labels) + hidden_col_ids = self._hidden_ordering_column_names.keys() + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types + ] + + label_columns = [] + for label_part, (col_id, label_dtype) in enumerate( + zip(index_col_ids, labels_dtypes) + ): + # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels + labels_as_tuples = [ + label if isinstance(label, tuple) else (label,) for label in row_labels + ] + cases = [ + ( + i, + bigframes.dtypes.literal_to_ibis_scalar( + label_tuple[label_part], # type:ignore + force_dtype=label_dtype, # type:ignore + ), + ) + for i, label_tuple in enumerate(labels_as_tuples) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(col_id) + ) + label_columns.append(labels_value) + + unpivot_values = [] + for j in range(len(unpivot_columns)): + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + result_col, source_cols = unpivot_columns[j] + null_value = bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ) + ibis_values = [ + ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + if col is not None + else null_value + for col in source_cols + ] + cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] + unpivot_value = typing.cast( + ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] + ).cases( + cases, default=null_value # type:ignore + ) + unpivot_values.append(unpivot_value.name(result_col)) + + unpivot_table = unpivot_table.select( + passthrough_columns, + *label_columns, + *unpivot_values, + *hidden_col_ids, + unpivot_offset_id, + ) + + # Extend the original ordering using unpivot_offset_id + old_ordering = self._ordering + if how == "left": + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + *old_ordering.ordering_value_columns, + OrderingColumnReference(unpivot_offset_id), + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), ) - columns = tuple(result[key] for key in result.columns) 
- expr = CompiledArrayValue(result, columns=columns, ordering=ordering) - if dropna: - for column_id in by_column_ids: - expr = expr._filter( - ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) - ) - # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation - return expr._project_offsets() - else: - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. - ordering = ExpressionOrdering( + else: # how=="right" + new_ordering = ExpressionOrdering( ordering_value_columns=tuple( - [OrderingColumnReference(ORDER_ID_COLUMN)] + [ + OrderingColumnReference(unpivot_offset_id), + *old_ordering.ordering_value_columns, + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] ), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) - return CompiledArrayValue( - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, ) + value_columns = [ + unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns + ] + passthrough_values = [unpivot_table[col] for col in passthrough_columns] + hidden_ordering_columns = [ + unpivot_table[unpivot_offset_id], + *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], + ] + return OrderedIR( + table=unpivot_table, + columns=[ + *[unpivot_table[col_id] for col_id in index_col_ids], + *value_columns, + *passthrough_values, + ], + hidden_ordering_columns=hidden_ordering_columns, + ordering=new_ordering, + ) - def corr_aggregate( - self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] - ) -> CompiledArrayValue: - """ - Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id. - This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. - Arguments: - corr_aggregations: left_column_id, right_column_id, output_column_id tuples - """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: table[col_left].corr(table[col_right], how="pop") - for col_left, col_right, col_out in corr_aggregations - } - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. 
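
That comment captures a small but load-bearing trick: a full-table aggregate produces exactly one row, so any constant is a valid total ordering key. A hedged sketch of the pattern (names assumed, not from this patch), where `stats` maps output ids to ibis reduction expressions:

import ibis.expr.types as ibis_types

def aggregate_single_row(table, stats):
    # One output row makes ordering trivial, but downstream ops (joins,
    # concat) still expect a total ordering column, so emit a literal 0.
    aggregates = {**stats, "bigframes_ordering_id": ibis_types.literal(0)}
    return table.aggregate(**aggregates)
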
- ordering = ExpressionOrdering( - ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + def _reproject_to_table(self) -> OrderedIR: + table = self._to_ibis_expr( + ordering_mode="unordered", + expose_hidden_cols=True, ) - return CompiledArrayValue( - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, + columns = [table[column_name] for column_name in self._column_names] + ordering_col_ids = [ + ref.column_id for ref in self._ordering.all_ordering_columns + ] + hidden_ordering_columns = [ + table[column_name] + for column_name in self._hidden_ordering_column_names + if column_name in ordering_col_ids + ] + return OrderedIR( + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, ) - def project_window_op( - self, - column_name: str, - op: agg_ops.WindowOp, - window_spec: WindowSpec, - output_name=None, - *, - never_skip_nulls=False, - skip_reproject_unsafe: bool = False, - ) -> CompiledArrayValue: - """ - Creates a new expression based on this expression with unary operation applied to one column. - column_name: the id of the input column present in the expression - op: the windowable operator to apply to the input column - window_spec: a specification of the window over which to apply the operator - output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided - never_skip_nulls: will disable null skipping for operators that would otherwise do so - skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection - """ - column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) - window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) - - window_op = op._as_ibis(column, window) - - clauses = [] - if op.skips_nulls and not never_skip_nulls: - clauses.append((column.isnull(), ibis.NA)) - if window_spec.min_periods: - if op.skips_nulls: - # Most operations do not count NULL values towards min_periods - observation_count = agg_ops.count_op._as_ibis(column, window) - else: - # Operations like count treat even NULLs as valid observations for the sake of min_periods - # notnull is just used to convert null values to non-null (FALSE) values to be counted - denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) - observation_count = agg_ops.count_op._as_ibis(denulled_value, window) - clauses.append( - ( - observation_count < ibis_types.literal(window_spec.min_periods), - ibis.NA, - ) - ) - if clauses: - case_statement = ibis.case() - for clause in clauses: - case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() - window_op = case_statement - - result = self._set_or_replace_by_id(output_name or column_name, window_op) - # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. 
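
The min_periods guard in project_window_op (moved in this patch, not changed) is worth illustrating: the windowed aggregate is wrapped in a CASE that nulls it out whenever the window holds fewer than min_periods observations. A standalone sketch under assumed toy data and window bounds:

import ibis

t = ibis.memtable({"key": [1, 2, 3, 4], "x": [1.0, None, 3.0, 4.0]})
w = ibis.window(order_by=t["key"], preceding=2, following=0)

windowed = t["x"].sum().over(w)
# count() skips NULLs, so it doubles as the per-window observation count.
observed = t["x"].count().over(w)
min_periods = 2
guarded = (
    ibis.case()
    .when(observed < ibis.literal(min_periods), ibis.NA)
    .else_(windowed)
    .end()
)
result = t.select(t["key"], guarded.name("rolling_sum"))
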
- return result._reproject_to_table() if not skip_reproject_unsafe else result - def to_sql( self, offset_column: typing.Optional[str] = None, @@ -654,11 +1287,13 @@ def to_sql( def _to_ibis_expr( self, - ordering_mode: Literal["string_encoded", "offset_col", "unordered"], - order_col_name: Optional[str] = ORDER_ID_COLUMN, + *, expose_hidden_cols: bool = False, fraction: Optional[float] = None, col_id_overrides: typing.Mapping[str, str] = {}, + ordering_mode: Literal["string_encoded", "offset_col", "unordered"], + order_col_name: Optional[str] = ORDER_ID_COLUMN, + **kwargs, ): """ Creates an Ibis table expression representing the DataFrame. @@ -677,16 +1312,16 @@ def _to_ibis_expr( column name will be 'bigframes_ordering_id' Args: + expose_hidden_cols: + If True, include the hidden ordering columns in the results. + Only compatible with `order_by` and `unordered` + ``ordering_mode``. ordering_mode: How to construct the Ibis expression from the ArrayValue. See above for details. order_col_name: If the ordering mode outputs a single ordering or offsets column, use this as the column name. - expose_hidden_cols: - If True, include the hidden ordering columns in the results. - Only compatible with `order_by` and `unordered` - ``ordering_mode``. col_id_overrides: overrides the column ids for the result Returns: @@ -716,27 +1351,122 @@ def _to_ibis_expr( order_columns = self._create_order_columns( ordering_mode, order_col_name, expose_hidden_cols ) - columns.extend(order_columns) - - # Special case for empty tables, since we can't create an empty - # projection. - if not columns: - return ibis.memtable([]) - - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. - table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + columns.extend(order_columns) + + # Special case for empty tables, since we can't create an empty + # projection. + if not columns: + return ibis.memtable([]) + + # Make sure all dtypes are the "canonical" ones for BigFrames. This is + # important for operations like UNION where the schema must match. 
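+        # (The same logical column can surface with slightly different ibis
+        # types depending on where it came from; casting everything through
+        # ibis_value_to_canonical_type keeps both sides of a union aligned.)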
+        table = self._table.select(
+            bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns
+        )
+        base_table = table
+        if self._reduced_predicate is not None:
+            table = table.filter(base_table[PREDICATE_COLUMN])
+        table = table.drop(*columns_to_drop)
+        if col_id_overrides:
+            table = table.relabel(col_id_overrides)
+        if fraction is not None:
+            table = table.filter(ibis.random() < ibis.literal(fraction))
+        return table
+
+    def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> OrderedIR:
+        """Safely assign by id while maintaining ordering integrity."""
+        # TODO: Split into explicit set and replace methods
+        ordering_col_ids = [
+            col_ref.column_id for col_ref in self._ordering.ordering_value_columns
+        ]
+        if id in ordering_col_ids:
+            return self._hide_column(id)._set_or_replace_by_id(id, new_value)
+
+        builder = self.builder()
+        if id in self.column_ids:
+            builder.columns = [
+                val if (col_id != id) else new_value.name(id)
+                for col_id, val in zip(self.column_ids, self._columns)
+            ]
+        else:
+            builder.columns = [*self.columns, new_value.name(id)]
+        return builder.build()
+
+    def filter(self, predicate_id: str, keep_null: bool = False) -> OrderedIR:
+        condition = typing.cast(
+            ibis_types.BooleanValue, self._get_ibis_column(predicate_id)
+        )
+        if keep_null:
+            condition = typing.cast(
+                ibis_types.BooleanValue,
+                condition.fillna(
+                    typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True))
+                ),
+            )
+        return self._filter(condition)
+
+    def _filter(self, predicate_value: ibis_types.BooleanValue) -> OrderedIR:
+        """Filter the table on a given expression; the predicate must be a boolean series aligned with the table expression."""
+        expr = self.builder()
+        expr.ordering = expr.ordering.with_non_sequential()
+        expr.predicates = [*self._predicates, predicate_value]
+        return expr.build()
+
+    ## Ordering specific helpers
+    def _get_any_column(self, key: str) -> ibis_types.Value:
+        """Gets the Ibis expression for a given column. Will also get hidden columns."""
+        all_columns = {**self._column_names, **self._hidden_ordering_column_names}
+        if key not in all_columns.keys():
+            raise ValueError(
+                "Column name {} not in set of values: {}".format(
+                    key, all_columns.keys()
+                )
+            )
+        return typing.cast(ibis_types.Value, all_columns[key])
+
+    def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column:
+        """Gets the Ibis expression for a given hidden column."""
+        if key not in self._hidden_ordering_column_names.keys():
+            raise ValueError(
+                "Column name {} not in set of values: {}".format(
+                    key, self._hidden_ordering_column_names.keys()
+                )
+            )
+        return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key])
+
+    def _hide_column(self, column_id) -> OrderedIR:
+        """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated."""
+        expr_builder = self.builder()
+        # Need to rename the column, as the caller might be creating a new column with the same name but different values.
+        # Could avoid this if callers weren't allowed to choose ids and unique ones were generated in this class instead.
+        new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_")
+        expr_builder.hidden_ordering_columns = [
+            *self._hidden_ordering_columns,
+            self._get_ibis_column(column_id).name(new_name),
+        ]
+        expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name})
+        return expr_builder.build()
+
+    def _project_offsets(self) -> OrderedIR:
+        """Create a new expression that contains offsets. Should only be executed when
+        offsets are needed for an operation. Has no effect on expression semantics."""
+        if self._ordering.is_sequential:
+            return self
+        # TODO(tbergeron): Enforce total ordering
+        table = self._to_ibis_expr(
+            ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN
+        )
+        columns = [table[column_name] for column_name in self._column_names]
+        ordering = ExpressionOrdering(
+            ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]),
+            total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
+            integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
+        )
+        return OrderedIR(
+            table,
+            columns=columns,
+            hidden_ordering_columns=[table[ORDER_ID_COLUMN]],
+            ordering=ordering,
         )
-        base_table = table
-        if self._reduced_predicate is not None:
-            table = table.filter(base_table[PREDICATE_COLUMN])
-        table = table.drop(*columns_to_drop)
-        if col_id_overrides:
-            table = table.relabel(col_id_overrides)
-        if fraction is not None:
-            table = table.filter(ibis.random() < ibis.literal(fraction))
-        return table
 
     def _create_order_columns(
         self,
@@ -789,34 +1519,6 @@ def _create_string_ordering_column(self) -> ibis_types.StringColumn:
         )
         return encode_order_string(row_nums)
 
-    def _reproject_to_table(self) -> CompiledArrayValue:
-        """
-        Internal operators that projects the internal representation into a
-        new ibis table expression where each value column is a direct
-        reference to a column in that table expression. Needed after
-        some operations such as window operations that cannot be used
-        recursively in projections.
-        """
-        table = self._to_ibis_expr(
-            "unordered",
-            expose_hidden_cols=True,
-        )
-        columns = [table[column_name] for column_name in self._column_names]
-        ordering_col_ids = [
-            ref.column_id for ref in self._ordering.all_ordering_columns
-        ]
-        hidden_ordering_columns = [
-            table[column_name]
-            for column_name in self._hidden_ordering_column_names
-            if column_name in ordering_col_ids
-        ]
-        return CompiledArrayValue(
-            table,
-            columns=columns,
-            hidden_ordering_columns=hidden_ordering_columns,
-            ordering=self._ordering,
-        )
-
     def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False):
         group_by: typing.List[ibis_types.Value] = (
             [
@@ -851,229 +1553,29 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal
             group_by=group_by,
         )
 
-    def unpivot(
-        self,
-        row_labels: typing.Sequence[typing.Hashable],
-        unpivot_columns: typing.Sequence[
-            typing.Tuple[str, typing.Sequence[typing.Optional[str]]]
-        ],
-        *,
-        passthrough_columns: typing.Sequence[str] = (),
-        index_col_ids: typing.Sequence[str] = ["index"],
-        dtype: typing.Union[
-            bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype]
-        ] = pandas.Float64Dtype(),
-        how="left",
-    ) -> CompiledArrayValue:
-        """
-        Unpivot ArrayValue columns.
-
-        Args:
-            row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument.
-            unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None.
-            passthrough_columns: Columns that will not be unpivoted. Column id will be preserved.
-            index_col_id (str): The column id to be used for the row labels.
-            dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns.
- - Returns: - ArrayValue: The unpivoted ArrayValue - """ - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr("unordered", expose_hidden_cols=True) - row_n = len(row_labels) - hidden_col_ids = self._hidden_ordering_column_names.keys() - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) + class Builder: + def __init__( + self, + table: ibis_types.Table, + ordering: ExpressionOrdering, + columns: Collection[ibis_types.Value] = (), + hidden_ordering_columns: Collection[ibis_types.Value] = (), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - *hidden_col_ids, - unpivot_offset_id, - ) - - # Extend the original ordering using unpivot_offset_id - old_ordering = self._ordering - if how == "left": - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - *old_ordering.ordering_value_columns, - OrderingColumnReference(unpivot_offset_id), - ] - ), - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - else: # how=="right" - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - OrderingColumnReference(unpivot_offset_id), - *old_ordering.ordering_value_columns, - ] - ), - total_ordering_columns=frozenset( - 
[*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - hidden_ordering_columns = [ - unpivot_table[unpivot_offset_id], - *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], - ] - return CompiledArrayValue( - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - hidden_ordering_columns=hidden_ordering_columns, - ordering=new_ordering, - ) - - def assign(self, source_id: str, destination_id: str) -> CompiledArrayValue: - return self._set_or_replace_by_id( - destination_id, self._get_ibis_column(source_id) - ) - - def assign_constant( - self, - destination_id: str, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> CompiledArrayValue: - # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. - ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) - if ibis_value is None: - raise NotImplementedError( - f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" + self.table = table + self.columns = list(columns) + self.hidden_ordering_columns = list(hidden_ordering_columns) + self.ordering = ordering + self.predicates = list(predicates) if predicates is not None else None + + def build(self) -> OrderedIR: + return OrderedIR( + table=self.table, + columns=self.columns, + hidden_ordering_columns=self.hidden_ordering_columns, + ordering=self.ordering, + predicates=self.predicates, ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() - - def _set_or_replace_by_id( - self, id: str, new_value: ibis_types.Value - ) -> CompiledArrayValue: - """Safely assign by id while maintaining ordering integrity.""" - # TODO: Split into explicit set and replace methods - ordering_col_ids = [ - col_ref.column_id for col_ref in self._ordering.ordering_value_columns - ] - if id in ordering_col_ids: - return self._hide_column(id)._set_or_replace_by_id(id, new_value) - - builder = self.builder() - if id in self.column_ids: - builder.columns = [ - val if (col_id != id) else new_value.name(id) - for col_id, val in zip(self.column_ids, self._columns) - ] - else: - builder.columns = [*self.columns, new_value.name(id)] - return builder.build() - - -class ArrayValueBuilder: - """Mutable expression class. - Use ArrayValue.builder() to create from a ArrayValue object. 
- """ - - def __init__( - self, - table: ibis_types.Table, - ordering: ExpressionOrdering, - columns: Collection[ibis_types.Value] = (), - hidden_ordering_columns: Collection[ibis_types.Value] = (), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, - ): - self.table = table - self.columns = list(columns) - self.hidden_ordering_columns = list(hidden_ordering_columns) - self.ordering = ordering - self.predicates = list(predicates) if predicates is not None else None - - def build(self) -> CompiledArrayValue: - return CompiledArrayValue( - table=self.table, - columns=self.columns, - hidden_ordering_columns=self.hidden_ordering_columns, - ordering=self.ordering, - predicates=self.predicates, - ) def _reduce_predicate_list( diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 195d830122..5959695b30 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -19,7 +19,8 @@ import pandas as pd -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled +import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.single_column import bigframes.core.nodes as nodes @@ -28,120 +29,160 @@ import bigframes.session +def compile_ordered(node: nodes.BigFrameNode) -> compiled.OrderedIR: + return typing.cast(compiled.OrderedIR, compile_node(node, True)) + + +def compile_unordered(node: nodes.BigFrameNode) -> compiled.UnorderedIR: + return typing.cast(compiled.UnorderedIR, compile_node(node, False)) + + @functools.cache -def compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: +def compile_node( + node: nodes.BigFrameNode, ordered: bool = True +) -> compiled.UnorderedIR | compiled.OrderedIR: """Compile node into CompileArrayValue. 
Caches result.""" - return _compile_node(node) + return _compile_node(node, ordered) @functools.singledispatch -def _compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: +def _compile_node( + node: nodes.BigFrameNode, ordered: bool = True +) -> compiled.UnorderedIR: """Defines transformation but isn't cached, always use compile_node instead""" raise ValueError(f"Can't compile unnrecognized node: {node}") @_compile_node.register -def compile_join(node: nodes.JoinNode): - compiled_left = compile_node(node.left_child) - compiled_right = compile_node(node.right_child) - return bigframes.core.compile.single_column.join_by_column( - compiled_left, - node.left_column_ids, - compiled_right, - node.right_column_ids, - how=node.how, - allow_row_identity_join=node.allow_row_identity_join, - ) +def compile_join(node: nodes.JoinNode, ordered: bool = True): + if ordered: + left_ordered = compile_ordered(node.left_child) + right_ordered = compile_ordered(node.right_child) + return bigframes.core.compile.single_column.join_by_column_ordered( + left_ordered, + node.left_column_ids, + right_ordered, + node.right_column_ids, + how=node.how, + allow_row_identity_join=node.allow_row_identity_join, + ) + else: + left_unordered = compile_unordered(node.left_child) + right_unordered = compile_unordered(node.right_child) + return bigframes.core.compile.single_column.join_by_column_unordered( + left_unordered, + node.left_column_ids, + right_unordered, + node.right_column_ids, + how=node.how, + allow_row_identity_join=node.allow_row_identity_join, + ) @_compile_node.register -def compile_select(node: nodes.SelectNode): - return compile_node(node.child).select_columns(node.column_ids) +def compile_select(node: nodes.SelectNode, ordered: bool = True): + return compile_node(node.child, ordered).select_columns(node.column_ids) @_compile_node.register -def compile_drop(node: nodes.DropColumnsNode): - return compile_node(node.child).drop_columns(node.columns) +def compile_drop(node: nodes.DropColumnsNode, ordered: bool = True): + return compile_node(node.child, ordered).drop_columns(node.columns) @_compile_node.register -def compile_readlocal(node: nodes.ReadLocalNode): +def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) - return compiled.CompiledArrayValue.mem_expr_from_pandas(array_as_pd) + if ordered: + return compiled.OrderedIR.from_pandas(array_as_pd) + else: + return compiled.UnorderedIR.from_pandas(array_as_pd) @_compile_node.register -def compile_readgbq(node: nodes.ReadGbqNode): - return compiled.CompiledArrayValue( - node.table, - node.columns, - node.hidden_ordering_columns, - node.ordering, - ) +def compile_readgbq(node: nodes.ReadGbqNode, ordered: bool = True): + if ordered: + return compiled.OrderedIR( + node.table, + node.columns, + node.hidden_ordering_columns, + node.ordering, + ) + else: + return compiled.UnorderedIR( + node.table, + node.columns, + ) @_compile_node.register -def compile_promote_offsets(node: nodes.PromoteOffsetsNode): - return compile_node(node.child).promote_offsets(node.col_id) +def compile_promote_offsets(node: nodes.PromoteOffsetsNode, ordered: bool = True): + result = compile_node(node.child, True).promote_offsets(node.col_id) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_filter(node: nodes.FilterNode): - return compile_node(node.child).filter(node.predicate_id, node.keep_null) +def compile_filter(node: nodes.FilterNode, ordered: bool = 
True): + return compile_node(node.child, ordered).filter(node.predicate_id, node.keep_null) @_compile_node.register -def compile_orderby(node: nodes.OrderByNode): - return compile_node(node.child).order_by(node.by, node.stable) +def compile_orderby(node: nodes.OrderByNode, ordered: bool = True): + return compile_node(node.child, ordered).order_by(node.by, node.stable) @_compile_node.register -def compile_reversed(node: nodes.ReversedNode): - return compile_node(node.child).reversed() +def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): + return compile_node(node.child, ordered).reversed() @_compile_node.register -def compile_project_unary(node: nodes.ProjectUnaryOpNode): - return compile_node(node.child).project_unary_op( +def compile_project_unary(node: nodes.ProjectUnaryOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_unary_op( node.input_id, node.op, node.output_id ) @_compile_node.register -def compile_project_binary(node: nodes.ProjectBinaryOpNode): - return compile_node(node.child).project_binary_op( +def compile_project_binary(node: nodes.ProjectBinaryOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_binary_op( node.left_input_id, node.right_input_id, node.op, node.output_id ) @_compile_node.register -def compile_project_ternary(node: nodes.ProjectTernaryOpNode): - return compile_node(node.child).project_ternary_op( +def compile_project_ternary(node: nodes.ProjectTernaryOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_ternary_op( node.input_id1, node.input_id2, node.input_id3, node.op, node.output_id ) @_compile_node.register -def compile_concat(node: nodes.ConcatNode): - compiled_nodes = [compile_node(node) for node in node.children] - return compiled_nodes[0].concat(compiled_nodes[1:]) +def compile_concat(node: nodes.ConcatNode, ordered: bool = True): + if ordered: + compiled_ordered = [compile_ordered(node) for node in node.children] + return concat_impl.concat_ordered(compiled_ordered) + else: + compiled_unordered = [compile_unordered(node) for node in node.children] + return concat_impl.concat_unordered(compiled_unordered) @_compile_node.register -def compile_aggregate(node: nodes.AggregateNode): - return compile_node(node.child).aggregate( +def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): + result = compile_node(node.child, False).aggregate( node.aggregations, node.by_column_ids, node.dropna ) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_corr(node: nodes.CorrNode): - return compile_node(node.child).corr_aggregate(node.corr_aggregations) +def compile_corr(node: nodes.CorrNode, ordered: bool = True): + result = compile_node(node.child, False).corr_aggregate(node.corr_aggregations) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_window(node: nodes.WindowOpNode): - return compile_node(node.child).project_window_op( +def compile_window(node: nodes.WindowOpNode, ordered: bool = True): + result = compile_node(node.child, True).project_window_op( node.column_name, node.op, node.window_spec, @@ -149,16 +190,17 @@ def compile_window(node: nodes.WindowOpNode): never_skip_nulls=node.never_skip_nulls, skip_reproject_unsafe=node.skip_reproject_unsafe, ) + return result if ordered else result.to_unordered() @_compile_node.register -def compile_reproject(node: nodes.ReprojectOpNode): - return compile_node(node.child)._reproject_to_table() +def compile_reproject(node: 
nodes.ReprojectOpNode, ordered: bool = True): + return compile_node(node.child, ordered)._reproject_to_table() @_compile_node.register -def compile_unpivot(node: nodes.UnpivotNode): - return compile_node(node.child).unpivot( +def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True): + return compile_node(node.child, ordered).unpivot( node.row_labels, node.unpivot_columns, passthrough_columns=node.passthrough_columns, @@ -169,17 +211,17 @@ def compile_unpivot(node: nodes.UnpivotNode): @_compile_node.register -def compile_assign(node: nodes.AssignNode): - return compile_node(node.child).assign(node.source_id, node.destination_id) +def compile_assign(node: nodes.AssignNode, ordered: bool = True): + return compile_node(node.child, ordered).assign(node.source_id, node.destination_id) @_compile_node.register -def compile_assign_constant(node: nodes.AssignConstantNode): - return compile_node(node.child).assign_constant( +def compile_assign_constant(node: nodes.AssignConstantNode, ordered: bool = True): + return compile_node(node.child, ordered).assign_constant( node.destination_id, node.value, node.dtype ) @_compile_node.register -def compiler_random_sample(node: nodes.RandomSampleNode): - return compile_node(node.child)._uniform_sampling(node.fraction) +def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): + return compile_node(node.child, ordered)._uniform_sampling(node.fraction) diff --git a/bigframes/core/compile/concat.py b/bigframes/core/compile/concat.py new file mode 100644 index 0000000000..d39569370e --- /dev/null +++ b/bigframes/core/compile/concat.py @@ -0,0 +1,100 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import math +import typing + +import ibis + +import bigframes.core.compile.compiled as compiled +from bigframes.core.ordering import ( + ExpressionOrdering, + OrderingColumnReference, + reencode_order_string, + StringEncoding, +) + +ORDER_ID_COLUMN = "bigframes_ordering_id" + + +def concat_unordered( + items: typing.Sequence[compiled.UnorderedIR], +) -> compiled.UnorderedIR: + """Append together multiple ArrayValue objects.""" + if len(items) == 1: + return items[0] + tables = [] + for expr in items: + table = expr._to_ibis_expr() + # Rename the value columns based on horizontal offset before applying union. 
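+        # (Every input ends up with an identical positional schema such as
+        # ["column_0", "column_1", ...]; ibis.union requires matching schemas,
+        # and callers are expected to have aligned the inputs column-wise.)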
+ table = table.select( + [table[col].name(f"column_{i}") for i, col in enumerate(table.columns)] + ) + tables.append(table) + combined_table = ibis.union(*tables) + return compiled.UnorderedIR( + combined_table, + columns=[combined_table[col] for col in combined_table.columns], + ) + + +def concat_ordered( + items: typing.Sequence[compiled.OrderedIR], +) -> compiled.OrderedIR: + """Append together multiple ArrayValue objects.""" + if len(items) == 1: + return items[0] + + tables = [] + prefix_base = 10 + prefix_size = math.ceil(math.log(len(items), prefix_base)) + # Must normalize all ids to the same encoding size + max_encoding_size = max( + *[expression._ordering.string_encoding.length for expression in items], + ) + for i, expr in enumerate(items): + ordering_prefix = str(i).zfill(prefix_size) + table = expr._to_ibis_expr( + ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN + ) + # Rename the value columns based on horizontal offset before applying union. + table = table.select( + [ + table[col].name(f"column_{i}") + if col != ORDER_ID_COLUMN + else ( + ordering_prefix + + reencode_order_string(table[ORDER_ID_COLUMN], max_encoding_size) + ).name(ORDER_ID_COLUMN) + for i, col in enumerate(table.columns) + ] + ) + tables.append(table) + combined_table = ibis.union(*tables) + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + string_encoding=StringEncoding(True, prefix_size + max_encoding_size), + ) + return compiled.OrderedIR( + combined_table, + columns=[ + combined_table[col] + for col in combined_table.columns + if col != ORDER_ID_COLUMN + ], + hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], + ordering=ordering, + ) diff --git a/bigframes/core/compile/row_identity.py b/bigframes/core/compile/row_identity.py index 2e9bc0527c..71d53f90dc 100644 --- a/bigframes/core/compile/row_identity.py +++ b/bigframes/core/compile/row_identity.py @@ -23,16 +23,76 @@ import ibis.expr.types as ibis_types import bigframes.constants as constants -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled import bigframes.core.joins.name_resolution as naming import bigframes.core.ordering as orderings SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"} -def join_by_row_identity( - left: compiled.CompiledArrayValue, right: compiled.CompiledArrayValue, *, how: str -) -> compiled.CompiledArrayValue: +def join_by_row_identity_unordered( + left: compiled.UnorderedIR, + right: compiled.UnorderedIR, + *, + how: str, +) -> compiled.UnorderedIR: + """Compute join when we are joining by row identity not a specific column.""" + if how not in SUPPORTED_ROW_IDENTITY_HOW: + raise NotImplementedError( + f"Only how='outer','left','inner' currently supported. {constants.FEEDBACK_LINK}" + ) + + if not left._table.equals(right._table): + raise ValueError( + "Cannot combine objects without an explicit join/merge key. 
" + f"Left based on: {left._table.compile()}, but " + f"right based on: {right._table.compile()}" + ) + + left_predicates = left._predicates + right_predicates = right._predicates + # TODO(tbergeron): Skip generating these for inner part of join + ( + left_relative_predicates, + right_relative_predicates, + ) = _get_relative_predicates(left_predicates, right_predicates) + + combined_predicates = [] + if left_predicates or right_predicates: + joined_predicates = _join_predicates( + left_predicates, right_predicates, join_type=how + ) + combined_predicates = list(joined_predicates) # builder expects mutable list + + left_mask = left_relative_predicates if how in ["right", "outer"] else None + right_mask = right_relative_predicates if how in ["left", "outer"] else None + + # Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result + map_left_id, map_right_id = naming.JOIN_NAME_REMAPPER( + left.column_ids, right.column_ids + ) + joined_columns = [ + _mask_value(left._get_ibis_column(key), left_mask).name(map_left_id[key]) + for key in left.column_ids + ] + [ + _mask_value(right._get_ibis_column(key), right_mask).name(map_right_id[key]) + for key in right.column_ids + ] + + joined_expr = compiled.UnorderedIR( + left._table, + columns=joined_columns, + predicates=combined_predicates, + ) + return joined_expr + + +def join_by_row_identity_ordered( + left: compiled.OrderedIR, + right: compiled.OrderedIR, + *, + how: str, +) -> compiled.OrderedIR: """Compute join when we are joining by row identity not a specific column.""" if how not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( @@ -118,7 +178,7 @@ def join_by_row_identity( if key.column_id in right._hidden_ordering_column_names.keys() ] - joined_expr = compiled.CompiledArrayValue( + joined_expr = compiled.OrderedIR( left._table, columns=joined_columns, hidden_ordering_columns=hidden_ordering_columns, diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index b992aa1d1d..cf206ae95f 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -23,16 +23,16 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types -import bigframes.core.compile as compiled +import bigframes.core.compile.compiled as compiled import bigframes.core.compile.row_identity import bigframes.core.joins as joining import bigframes.core.ordering as orderings -def join_by_column( - left: compiled.CompiledArrayValue, +def join_by_column_ordered( + left: compiled.OrderedIR, left_column_ids: typing.Sequence[str], - right: compiled.CompiledArrayValue, + right: compiled.OrderedIR, right_column_ids: typing.Sequence[str], *, how: Literal[ @@ -42,7 +42,7 @@ def join_by_column( "right", ], allow_row_identity_join: bool = True, -) -> compiled.CompiledArrayValue: +) -> compiled.OrderedIR: """Join two expressions by column equality. Arguments: @@ -67,13 +67,13 @@ def join_by_column( # regards to value its possible that they both have the same names but # were modified in different ways. Ignore differences in the names. 
and all( - left._get_any_column(lcol) + left._get_ibis_column(lcol) .name("index") - .equals(right._get_any_column(rcol).name("index")) + .equals(right._get_ibis_column(rcol).name("index")) for lcol, rcol in zip(left_column_ids, right_column_ids) ) ): - return bigframes.core.compile.row_identity.join_by_row_identity( + return bigframes.core.compile.row_identity.join_by_row_identity_ordered( left, right, how=how ) else: @@ -88,12 +88,12 @@ def join_by_column( r_mapping = {**r_public_mapping, **r_hidden_mapping} left_table = left._to_ibis_expr( - "unordered", + ordering_mode="unordered", expose_hidden_cols=True, col_id_overrides=l_mapping, ) right_table = right._to_ibis_expr( - "unordered", + ordering_mode="unordered", expose_hidden_cols=True, col_id_overrides=r_mapping, ) @@ -134,7 +134,7 @@ def join_by_column( for col in right._hidden_ordering_columns ], ] - return compiled.CompiledArrayValue( + return compiled.OrderedIR( combined_table, columns=columns, hidden_ordering_columns=hidden_ordering_columns, @@ -142,6 +142,87 @@ def join_by_column( ) +def join_by_column_unordered( + left: compiled.UnorderedIR, + left_column_ids: typing.Sequence[str], + right: compiled.UnorderedIR, + right_column_ids: typing.Sequence[str], + *, + how: Literal[ + "inner", + "left", + "outer", + "right", + ], + allow_row_identity_join: bool = True, +) -> compiled.UnorderedIR: + """Join two expressions by column equality. + + Arguments: + left: Expression for left table to join. + left_column_ids: Column IDs (not label) to join by. + right: Expression for right table to join. + right_column_ids: Column IDs (not label) to join by. + how: The type of join to perform. + allow_row_identity_join (bool): + If True, allow matching by row identity. Set to False to always + perform a true JOIN in generated SQL. + Returns: + The joined expression. The resulting columns will be, in order, + first the coalesced join keys, then, all the left columns, and + finally, all the right columns. + """ + if ( + allow_row_identity_join + and how in bigframes.core.compile.row_identity.SUPPORTED_ROW_IDENTITY_HOW + and left._table.equals(right._table) + # Make sure we're joining on exactly the same column(s), at least with + # regards to value its possible that they both have the same names but + # were modified in different ways. Ignore differences in the names. + and all( + left._get_ibis_column(lcol) + .name("index") + .equals(right._get_ibis_column(rcol).name("index")) + for lcol, rcol in zip(left_column_ids, right_column_ids) + ) + ): + return bigframes.core.compile.row_identity.join_by_row_identity_unordered( + left, right, how=how + ) + else: + # Value column mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result + l_mapping, r_mapping = joining.JOIN_NAME_REMAPPER( + left.column_ids, right.column_ids + ) + left_table = left._to_ibis_expr( + col_id_overrides=l_mapping, + ) + right_table = right._to_ibis_expr( + col_id_overrides=r_mapping, + ) + join_conditions = [ + value_to_join_key(left_table[l_mapping[left_index]]) + == value_to_join_key(right_table[r_mapping[right_index]]) + for left_index, right_index in zip(left_column_ids, right_column_ids) + ] + + combined_table = ibis.join( + left_table, + right_table, + predicates=join_conditions, + how=how, + ) + # We could filter out the original join columns, but predicates/ordering + # might still reference them in implicit joins. 
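+        # (The remapped copies of both sides' key columns therefore stay in
+        # the output, addressable through the same JOIN_NAME_REMAPPER ids as
+        # in the ordered variant above.)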
+ columns = [ + combined_table[l_mapping[col.get_name()]] for col in left.columns + ] + [combined_table[r_mapping[col.get_name()]] for col in right.columns] + return compiled.UnorderedIR( + combined_table, + columns=columns, + ) + + def value_to_join_key(value: ibis_types.Value): """Converts nullable values to non-null string SQL will not match null keys together - but pandas does.""" if not value.type().is_string(): diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 01117d3e0a..db68033c51 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -857,6 +857,8 @@ def to_pandas( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> pandas.DataFrame: """Write DataFrame to pandas DataFrame. @@ -876,6 +878,9 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. + ordered (bool, default True): + Determines whether the resulting dataframe will be sorted. In some cases, + unordered may result in a faster-executing query. Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the @@ -887,6 +892,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, + ordered=ordered, ) self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 473de62f53..8661678588 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -599,10 +599,8 @@ def _read_gbq_table( is_total_ordering = True ordering = orderings.ExpressionOrdering( ordering_value_columns=tuple( - [ - core.OrderingColumnReference(column_id) - for column_id in total_ordering_cols - ] + core.OrderingColumnReference(column_id) + for column_id in total_ordering_cols ), total_ordering_columns=frozenset(total_ordering_cols), ) diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index eae6896669..cef167d4ac 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -16,7 +16,7 @@ import pytest from bigframes.ml import cluster -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal @pytest.mark.flaky(retries=2, delay=120) @@ -105,7 +105,7 @@ def test_cluster_configure_fit_score_predict( index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) expected.index.name = "observation" - assert_pandas_df_equal_ignore_ordering(result, expected) + assert_pandas_df_equal(result, expected) # save, load, check n_clusters to ensure configuration was kept reloaded_model = model.to_gbq( diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 6874a9f301..3197320047 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -24,7 +24,7 @@ pipeline, preprocessing, ) -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal def test_pipeline_linear_regression_fit_score_predict( @@ -555,7 +555,7 @@ def test_pipeline_standard_scaler_kmeans_fit_score_predict( ), ) expected.index.name = "observation" - 
assert_pandas_df_equal_ignore_ordering(result, expected) + assert_pandas_df_equal(result, expected) def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_index): diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 730a1dbde4..57188b0470 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -31,7 +31,7 @@ get_cloud_function_name, get_remote_function_locations, ) -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal # Use this to control the number of cloud functions being deleted in a single # test session. This should help soften the spike of the number of mutations per @@ -356,7 +356,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets(session.bqclient, functions_client, square) @@ -400,7 +400,7 @@ def add_one(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -445,7 +445,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets(session.bqclient, functions_client, square) @@ -496,7 +496,7 @@ def sign(num): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets(session.bqclient, functions_client, remote_sign) @@ -541,7 +541,7 @@ def circumference(radius): pd_result_col = pd_result_col.astype(pandas.Float64Dtype()) pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -590,7 +590,7 @@ def find_team(num): pd_result_col = pd_result_col.astype(pandas.StringDtype(storage="pyarrow")) pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -674,7 +674,7 @@ def inner_test(): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # Test that the remote function works as expected 
inner_test() @@ -764,7 +764,7 @@ def is_odd(num): pd_result_col = pd_int64_col.mask(is_odd) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -807,7 +807,7 @@ def is_odd(num): pd_result_col = pd_int64_col[pd_int64_col.notnull()].mask(is_odd, -1) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -851,7 +851,7 @@ def test_remote_udf_lambda( pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -908,7 +908,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -953,7 +953,7 @@ def pd_np_foo(x): # comparing for the purpose of this test pd_result.result = pd_result.result.astype(pandas.Float64Dtype()) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_remote_function_assets( @@ -997,7 +997,7 @@ def test_internal(rf, udf): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # Create an explicit name for the remote function prefixer = test_utils.prefixer.Prefixer("foo", "") diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index d95a1e1bc2..a3af71892f 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -15,7 +15,7 @@ import pandas as pd from bigframes.ml import cluster -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal _PD_NEW_PENGUINS = pd.DataFrame.from_dict( { @@ -68,7 +68,7 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans): dtype="Int64", index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) - assert_pandas_df_equal_ignore_ordering(result, expected) + assert_pandas_df_equal(result, expected) def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans): diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index f911dd7eeb..cb6507e4e3 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -225,7 +225,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383], }, ) - tests.system.utils.assert_pandas_df_equal_ignore_ordering( + tests.system.utils.assert_pandas_df_equal( result, 
expected, check_exact=False, diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index e31681f4a0..b46b3d103d 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -130,7 +130,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): "explained_variance": [3.278657, 1.270829, 1.125354], }, ) - tests.system.utils.assert_pandas_df_equal_ignore_ordering( + tests.system.utils.assert_pandas_df_equal( result, expected, check_exact=False, @@ -149,7 +149,7 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): "explained_variance_ratio": [0.469357, 0.181926, 0.1611], }, ) - tests.system.utils.assert_pandas_df_equal_ignore_ordering( + tests.system.utils.assert_pandas_df_equal( result, expected, check_exact=False, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e459e3bee3..9494723ef7 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -29,7 +29,7 @@ import bigframes.dataframe as dataframe import bigframes.series as series from tests.system.utils import ( - assert_pandas_df_equal_ignore_ordering, + assert_pandas_df_equal, assert_series_equal_ignoring_order, ) @@ -246,7 +246,7 @@ def test_drop_with_custom_column_labels(scalars_dfs): pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( columns=dropped_columns ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_drop_index(scalars_dfs): @@ -420,7 +420,7 @@ def test_filter_df(scalars_dfs): pd_bool_series = scalars_pandas_df["bool_col"] pd_result = scalars_pandas_df[pd_bool_series] - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_new_column(scalars_dfs): @@ -433,7 +433,7 @@ def test_assign_new_column(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_new_column_w_loc(scalars_dfs): @@ -564,7 +564,7 @@ def test_assign_existing_column(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_series(scalars_dfs): @@ -574,7 +574,7 @@ def test_assign_series(scalars_dfs): bf_result = df.to_pandas() pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_series_overwrite(scalars_dfs): @@ -586,7 +586,7 @@ def test_assign_series_overwrite(scalars_dfs): **{column_name: scalars_pandas_df[column_name] + 3} ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_sequential(scalars_dfs): @@ -601,7 +601,7 @@ def test_assign_sequential(scalars_dfs): pd_result["new_col"] = pd_result["new_col"].astype("Int64") pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # Require an index so that the self-join is consistent each time. @@ -635,7 +635,7 @@ def test_assign_different_df( new_col=scalars_pandas_df_index[column_name] ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_assign_different_df_w_loc( @@ -686,7 +686,7 @@ def test_assign_callable_lambda(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -852,7 +852,7 @@ def test_df_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -885,7 +885,7 @@ def test_df_merge_multi_key(scalars_dfs, left_on, right_on): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -915,7 +915,7 @@ def test_merge_custom_col_name(scalars_dfs, merge_how): pandas_right_df = scalars_pandas_df[right_columns] pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -948,7 +948,7 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) def test_get_dtypes(scalars_df_default_index): @@ -1240,7 +1240,7 @@ def test_df_abs(scalars_dfs): bf_result = scalars_df[columns].abs().to_pandas() pd_result = scalars_pandas_df[columns].abs() - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_df_isnull(scalars_dfs): @@ -1257,7 +1257,7 @@ def test_df_isnull(scalars_dfs): pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_df_notnull(scalars_dfs): @@ -1274,7 +1274,7 @@ def test_df_notnull(scalars_dfs): 
pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1494,7 +1494,7 @@ def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize(("other_scalar"), [1, -2]) @@ -1506,7 +1506,7 @@ def test_mod(scalars_dfs, other_scalar): bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_scalar_binop_str_exception(scalars_dfs): @@ -1562,7 +1562,7 @@ def test_series_binop_axis_index( bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1623,7 +1623,7 @@ def test_series_binop_add_different_table( scalars_pandas_df_index[series_column], axis="index" ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) # TODO(garrettwu): Test series binop with different index @@ -1649,7 +1649,7 @@ def test_join_same_table(scalars_dfs, how): pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]] pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -1662,7 +1662,7 @@ def test_join_different_table( pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) def test_join_duplicate_columns_raises_not_implemented(scalars_dfs): @@ -1686,7 +1686,7 @@ def test_join_param_on(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_df_b = pd_df[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -2553,7 +2553,7 @@ def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. 
pd_result.index = pd_result.index.astype(pd.Int64Dtype()) # Ignore ordering as pandas order differently depending on version - assert_pandas_df_equal_ignore_ordering( + assert_pandas_df_equal( bf_result, pd_result, check_names=False, diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 8f5d706f62..a235845937 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -19,10 +19,7 @@ import pyarrow as pa import pytest -from tests.system.utils import ( - assert_pandas_df_equal_ignore_ordering, - convert_pandas_dtypes, -) +from tests.system.utils import assert_pandas_df_equal, convert_pandas_dtypes try: import pandas_gbq # type: ignore @@ -380,7 +377,7 @@ def test_to_sql_query_unnamed_index_included( pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql, index_col=idx_ids) roundtrip.index.names = [None] - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) def test_to_sql_query_named_index_included( @@ -397,7 +394,7 @@ def test_to_sql_query_named_index_included( pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True) roundtrip = session.read_gbq(sql, index_col=idx_ids) - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) def test_to_sql_query_unnamed_index_excluded( @@ -412,7 +409,7 @@ def test_to_sql_query_unnamed_index_excluded( pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) def test_to_sql_query_named_index_excluded( @@ -429,4 +426,4 @@ def test_to_sql_query_named_index_excluded( "rowindex_2", drop=True ).reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index a87dacae04..4eadb6fe86 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -16,7 +16,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal # Row Multi-index tests @@ -429,7 +429,7 @@ def test_multi_index_dataframe_join(scalars_dfs, how): (["bool_col", "rowindex_2"]) )[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -450,7 +450,7 @@ def test_multi_index_dataframe_join_on(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_df_b = pd_df[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index f8fa78587f..56bea42ad5 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -16,7 +16,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from 
tests.system.utils import assert_pandas_df_equal def test_concat_dataframe(scalars_dfs): @@ -140,7 +140,7 @@ def test_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -174,7 +174,7 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -208,7 +208,7 @@ def test_merge_series(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) def test_cut(scalars_dfs): diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index d024a57ded..853ddff78a 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -21,7 +21,7 @@ import bigframes from bigframes import remote_function as rf import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal_ignore_ordering +from tests.system.utils import assert_pandas_df_equal @pytest.fixture(scope="module") @@ -155,7 +155,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -204,7 +204,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -280,7 +280,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -341,7 +341,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -378,7 +378,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -417,7 +417,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -463,7 +463,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -493,7 +493,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = 
pd_int64_col_filtered.to_frame().assign(result=pd_result_col)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@@ -520,7 +520,7 @@ def add_one(x):
for col in pd_result:
pd_result[col] = pd_result[col].astype(pd_int64_df_filtered[col].dtype)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@@ -545,7 +545,7 @@ def add_one(x):
for col in pd_result:
pd_result[col] = pd_result[col].astype(pd_int64_df[col].dtype)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@@ -631,7 +631,7 @@ def square1(x):
s2_result_col = int64_col_filtered.apply(square2)
s2_result = int64_col_filtered.to_frame().assign(result=s2_result_col)
- assert_pandas_df_equal_ignore_ordering(s1_result.to_pandas(), s2_result.to_pandas())
+ assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas())
@@ -679,7 +679,7 @@ def test_read_gbq_function_reads_udfs(bigquery_client, scalars_dfs, dataset_id):
indirect_df = indirect_df.assign(y=indirect_df.x.apply(square))
indirect_df = indirect_df.to_pandas()
- assert_pandas_df_equal_ignore_ordering(direct_df, indirect_df)
+ assert_pandas_df_equal(direct_df, indirect_df)
@pytest.mark.flaky(retries=2, delay=120)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 05d8b84185..5e494fbd21 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -25,7 +25,7 @@ import bigframes.pandas
import bigframes.series as series
from tests.system.utils import (
- assert_pandas_df_equal_ignore_ordering,
+ assert_pandas_df_equal,
assert_series_equal_ignoring_order,
)
@@ -2256,7 +2256,7 @@ def test_to_frame(scalars_dfs):
bf_result = scalars_df["int64_col"].to_frame().to_pandas()
pd_result = scalars_pandas_df["int64_col"].to_frame()
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_to_json(scalars_df_index, scalars_pandas_df_index):
@@ -2424,7 +2424,7 @@ def test_mask_default_value(scalars_dfs):
pd_col_masked = pd_col.mask(pd_col % 2 == 1)
pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
def test_mask_custom_value(scalars_dfs):
@@ -2442,7 +2442,7 @@
# odd so should be left as is, but it is being masked in pandas.
# Coincidentally, the bigframes behavior matches, but it should be updated
# after the resolution of https://github.com/pandas-dev/pandas/issues/52955
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result)
@pytest.mark.parametrize(
@@ -2538,7 +2538,7 @@ def test_loc_bool_series_default_index(
scalars_pandas_df_default_index.bool_col
]
- assert_pandas_df_equal_ignore_ordering(
+ assert_pandas_df_equal(
bf_result.to_frame(),
pd_result.to_frame(),
)
diff --git a/tests/system/utils.py b/tests/system/utils.py
index e2daf3b8bf..c68acf10f3 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -21,18 +21,19 @@ import pyarrow as pa # type: ignore
-def assert_pandas_df_equal_ignore_ordering(df0, df1, **kwargs):
- # Sort by a column to get consistent results.
- if df0.index.name != "rowindex": - df0 = df0.sort_values( - list(df0.columns.drop("geography_col", errors="ignore")) - ).reset_index(drop=True) - df1 = df1.sort_values( - list(df1.columns.drop("geography_col", errors="ignore")) - ).reset_index(drop=True) - else: - df0 = df0.sort_index() - df1 = df1.sort_index() +def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): + if ignore_order: + # Sort by a column to get consistent results. + if df0.index.name != "rowindex": + df0 = df0.sort_values( + list(df0.columns.drop("geography_col", errors="ignore")) + ).reset_index(drop=True) + df1 = df1.sort_values( + list(df1.columns.drop("geography_col", errors="ignore")) + ).reset_index(drop=True) + else: + df0 = df0.sort_index() + df1 = df1.sort_index() pd.testing.assert_frame_equal(df0, df1, **kwargs) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index d9672b2635..e7026ebd87 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -49,7 +49,7 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): ordering=ordering, hidden_ordering_columns=(), ) - assert actual.compile()._table is ibis_table + assert actual._compile()._table is ibis_table assert len(actual.column_ids) == 3 @@ -83,7 +83,7 @@ def test_arrayvalue_with_get_column(): ), total_ordering_columns=["col1"], ) - col1 = value.compile()._get_ibis_column("col1") + col1 = value._compile()._get_ibis_column("col1") assert isinstance(col1, ibis_types.Value) assert col1.get_name() == "col1" assert col1.type().is_int64() @@ -100,7 +100,7 @@ def test_arrayvalues_to_ibis_expr_with_get_column(): ), total_ordering_columns=["col1"], ) - expr = value.compile()._get_ibis_column("col1") + expr = value._compile()._get_ibis_column("col1") assert expr.get_name() == "col1" assert expr.type().is_int64() @@ -117,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr.compile()._to_ibis_expr("unordered") + actual = expr._compile()._to_ibis_expr("unordered") assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -136,8 +136,8 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_unary_op("col1", ops.AsTypeOp("string")).compile() - assert value.compile().columns[0].type().is_int64() + expr = value.project_unary_op("col1", ops.AsTypeOp("string"))._compile() + assert value._compile().columns[0].type().is_int64() assert expr.columns[0].type().is_string() @@ -152,7 +152,7 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_binary_op("col2", "col3", ops.add_op, "col4").compile() + expr = value.project_binary_op("col2", "col3", ops.add_op, "col4")._compile() assert expr.columns[3].type().is_float64() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 4 @@ -173,7 +173,7 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ) expr = value.project_ternary_op( "col2", "col3", "col4", ops.where_op, "col5" - ).compile() + )._compile() assert expr.columns[4].type().is_float64() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 5 @@ -195,7 +195,7 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): aggregations=(("col1", agg_ops.sum_op, "col4"),), by_column_ids=["col1"], dropna=False, - ).compile() + )._compile() actual = expr._to_ibis_expr("unordered") assert 
len(expr.columns) == 2 assert actual.columns[0] == "col1" @@ -214,7 +214,7 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): ), total_ordering_columns=["col1"], ) - expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]).compile() + expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")])._compile() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 1 assert actual.columns[0] == "col4" From 5aa31372f9a2d22e4d4265f1e7b646b9239ed2b0 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 28 Oct 2023 00:49:46 +0000 Subject: [PATCH 02/11] add tests --- bigframes/core/__init__.py | 6 ++ bigframes/series.py | 3 + .../system/small/operations/test_datetimes.py | 22 +++--- tests/system/small/operations/test_strings.py | 42 +++++------ tests/system/small/test_dataframe.py | 26 ++++--- tests/system/small/test_series.py | 73 +++++++++---------- tests/system/utils.py | 17 +++-- tests/unit/test_core.py | 10 +-- 8 files changed, 107 insertions(+), 92 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 7f2e231edb..d36a50ff37 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -219,6 +219,12 @@ def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: nodes.SelectNode(child=self.node, column_ids=tuple(column_ids)) ) + def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: + """Append together multiple ArrayValue objects.""" + return ArrayValue( + nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]])) + ) + def project_unary_op( self, column_name: str, op: ops.UnaryOp, output_name=None ) -> ArrayValue: diff --git a/bigframes/series.py b/bigframes/series.py index 37d00d16f3..5b22756d19 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -265,6 +265,8 @@ def to_pandas( max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, + *, + ordered: bool = True, ) -> pandas.Series: """Writes Series to pandas Series. 
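A minimal usage sketch of the new keyword-only `ordered` flag added above (illustrative only, not part of the patch; the table and column names are stand-ins for any readable BigQuery data):

    import bigframes.pandas as bpd

    # Hypothetical public table; any table readable by the session works.
    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

    # Default: rows come back in a deterministic order, which may force the
    # compiler to generate and sort by hidden ordering columns.
    ordered_series = df["body_mass_g"].to_pandas()

    # Opting out lets the unordered compilation path emit simpler,
    # potentially faster SQL when row order does not matter.
    unordered_series = df["body_mass_g"].to_pandas(ordered=False)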
@@ -294,6 +296,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, + ordered=ordered, ) self._set_internal_query_job(query_job) series = df[self._value_column] diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 7dc55b9367..177194c7a8 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -16,7 +16,7 @@ import pytest import bigframes.series -from tests.system.utils import assert_series_equal_ignoring_order +from tests.system.utils import assert_series_equal DATETIME_COL_NAMES = [("datetime_col",), ("timestamp_col",)] @@ -33,7 +33,7 @@ def test_day(scalars_dfs, col_name): bf_result = bf_series.dt.day.to_pandas() pd_result = scalars_pandas_df[col_name].dt.day - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -51,7 +51,7 @@ def test_date(scalars_dfs, col_name): bf_result = bf_series.dt.date.to_pandas() pd_result = scalars_pandas_df[col_name].dt.date - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -69,7 +69,7 @@ def test_dayofweek(scalars_dfs, col_name): bf_result = bf_series.dt.dayofweek.to_pandas() pd_result = scalars_pandas_df[col_name].dt.dayofweek - assert_series_equal_ignoring_order(pd_result, bf_result, check_dtype=False) + assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( @@ -84,7 +84,7 @@ def test_hour(scalars_dfs, col_name): bf_result = bf_series.dt.hour.to_pandas() pd_result = scalars_pandas_df[col_name].dt.hour - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -102,7 +102,7 @@ def test_minute(scalars_dfs, col_name): bf_result = bf_series.dt.minute.to_pandas() pd_result = scalars_pandas_df[col_name].dt.minute - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -120,7 +120,7 @@ def test_month(scalars_dfs, col_name): bf_result = bf_series.dt.month.to_pandas() pd_result = scalars_pandas_df[col_name].dt.month - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -138,7 +138,7 @@ def test_quarter(scalars_dfs, col_name): bf_result = bf_series.dt.quarter.to_pandas() pd_result = scalars_pandas_df[col_name].dt.quarter - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -156,7 +156,7 @@ def test_second(scalars_dfs, col_name): bf_result = bf_series.dt.second.to_pandas() pd_result = scalars_pandas_df[col_name].dt.second - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -174,7 +174,7 @@ def test_time(scalars_dfs, col_name): bf_result = bf_series.dt.time.to_pandas() pd_result = scalars_pandas_df[col_name].dt.time - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -192,7 +192,7 @@ def test_year(scalars_dfs, col_name): bf_result = bf_series.dt.year.to_pandas() pd_result = scalars_pandas_df[col_name].dt.year - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 241cbd576b..27a35134d4 100644 --- a/tests/system/small/operations/test_strings.py +++ 
b/tests/system/small/operations/test_strings.py @@ -19,7 +19,7 @@ import bigframes.series -from ...utils import assert_series_equal_ignoring_order +from ...utils import assert_series_equal def test_find(scalars_dfs): @@ -31,7 +31,7 @@ def test_find(scalars_dfs): # One of type mismatches to be documented. Here, the `bf_result.dtype` is `Int64` but # the `pd_result.dtype` is `float64`: https://github.com/pandas-dev/pandas/issues/51948 - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -173,7 +173,7 @@ def test_len(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is `Int64` but # the `pd_result.dtype` is `float64`: https://github.com/pandas-dev/pandas/issues/51948 - assert_series_equal_ignoring_order( + assert_series_equal( pd_result.astype(pd.Int64Dtype()), bf_result, ) @@ -186,7 +186,7 @@ def test_lower(scalars_dfs): bf_result = bf_series.str.lower().to_pandas() pd_result = scalars_pandas_df[col_name].str.lower() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -205,7 +205,7 @@ def test_reverse(scalars_dfs): else: pd_result.loc[i] = cell[::-1] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -222,7 +222,7 @@ def test_slice(scalars_dfs, start, stop): pd_series = scalars_pandas_df[col_name] pd_result = pd_series.str.slice(start, stop) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -235,7 +235,7 @@ def test_strip(scalars_dfs): bf_result = bf_series.str.strip().to_pandas() pd_result = scalars_pandas_df[col_name].str.strip() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -248,7 +248,7 @@ def test_upper(scalars_dfs): bf_result = bf_series.str.upper().to_pandas() pd_result = scalars_pandas_df[col_name].str.upper() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -330,7 +330,7 @@ def test_islower(weird_strings, weird_strings_pd): pd_result = weird_strings_pd.str.islower() bf_result = weird_strings.str.islower().to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result.astype(pd.BooleanDtype()) # the dtype here is a case of intentional diversion from pandas @@ -342,7 +342,7 @@ def test_isupper(weird_strings, weird_strings_pd): pd_result = weird_strings_pd.str.isupper() bf_result = weird_strings.str.isupper().to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result.astype(pd.BooleanDtype()) # the dtype here is a case of intentional diversion from pandas @@ -357,7 +357,7 @@ def test_rstrip(scalars_dfs): bf_result = bf_series.str.rstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.rstrip() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -370,7 +370,7 @@ def test_lstrip(scalars_dfs): bf_result = bf_series.str.lstrip().to_pandas() pd_result = scalars_pandas_df[col_name].str.lstrip() - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -384,7 +384,7 @@ def test_repeat(scalars_dfs, repeats): bf_result = bf_series.str.repeat(repeats).to_pandas() pd_result = scalars_pandas_df[col_name].str.repeat(repeats) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -397,7 +397,7 @@ def test_capitalize(scalars_dfs): bf_result = bf_series.str.capitalize().to_pandas() pd_result = scalars_pandas_df[col_name].str.capitalize() - 
assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -415,7 +415,7 @@ def test_cat_with_series(scalars_dfs): pd_right = scalars_pandas_df[col_name] pd_result = pd_left.str.cat(others=pd_right) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -429,7 +429,7 @@ def test_str_match(scalars_dfs): bf_result = bf_series.str.match(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.match(pattern) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -443,7 +443,7 @@ def test_str_fullmatch(scalars_dfs): bf_result = bf_series.str.fullmatch(pattern).to_pandas() pd_result = scalars_pandas_df[col_name].str.fullmatch(pattern) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -456,7 +456,7 @@ def test_str_get(scalars_dfs): bf_result = bf_series.str.get(8).to_pandas() pd_result = scalars_pandas_df[col_name].str.get(8) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -469,7 +469,7 @@ def test_str_pad(scalars_dfs): bf_result = bf_series.str.pad(8, side="both", fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.pad(8, side="both", fillchar="%") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -492,7 +492,7 @@ def test_str_ljust(scalars_dfs): bf_result = bf_series.str.ljust(7, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.ljust(7, fillchar="%") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -505,7 +505,7 @@ def test_str_rjust(scalars_dfs): bf_result = bf_series.str.rjust(9, fillchar="%").to_pandas() pd_result = scalars_pandas_df[col_name].str.rjust(9, fillchar="%") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9494723ef7..3adda34c40 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -28,10 +28,7 @@ import bigframes._config.display_options as display_options import bigframes.dataframe as dataframe import bigframes.series as series -from tests.system.utils import ( - assert_pandas_df_equal, - assert_series_equal_ignoring_order, -) +from tests.system.utils import assert_pandas_df_equal, assert_series_equal def test_df_construct_copy(scalars_dfs): @@ -98,7 +95,7 @@ def test_get_column(scalars_dfs): series = scalars_df[col_name] bf_result = series.to_pandas() pd_result = scalars_pandas_df[col_name] - assert_series_equal_ignoring_order(bf_result, pd_result) + assert_series_equal(bf_result, pd_result) def test_get_column_nonstring(scalars_dfs): @@ -106,7 +103,7 @@ def test_get_column_nonstring(scalars_dfs): series = scalars_df.rename(columns={"int64_col": 123.1})[123.1] bf_result = series.to_pandas() pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1] - assert_series_equal_ignoring_order(bf_result, pd_result) + assert_series_equal(bf_result, pd_result) def test_hasattr(scalars_dfs): @@ -183,7 +180,7 @@ def test_get_column_by_attr(scalars_dfs): series = scalars_df.int64_col bf_result = series.to_pandas() pd_result = scalars_pandas_df.int64_col - assert_series_equal_ignoring_order(bf_result, pd_result) + assert_series_equal(bf_result, pd_result) def test_get_columns(scalars_dfs): @@ -2279,6 +2276,13 @@ def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs): pd_df.loc[pd_df["int64_too"] == 1, 
"string_col"] = 99 +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) @pytest.mark.parametrize( ("op"), [ @@ -2293,16 +2297,18 @@ def test_loc_setitem_bool_series_scalar_type_error(scalars_dfs): ], ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], ) -def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op): +def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ordered): col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] bf_series = op(scalars_df_index[col_names]) pd_series = op(scalars_pandas_df_index[col_names]) - bf_result = bf_series.to_pandas() + bf_result = bf_series.to_pandas(ordered=ordered) # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_series = pd_series.astype("Float64") # Pandas has object index type - pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + assert_series_equal( + pd_series, bf_result, check_index_type=False, ignore_order=not ordered + ) @pytest.mark.parametrize( diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 5e494fbd21..d7578bc985 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -24,10 +24,7 @@ import bigframes.pandas import bigframes.series as series -from tests.system.utils import ( - assert_pandas_df_equal, - assert_series_equal_ignoring_order, -) +from tests.system.utils import assert_pandas_df_equal, assert_series_equal def test_series_construct_copy(scalars_dfs): @@ -210,7 +207,7 @@ def test_abs(scalars_dfs, col_name): bf_result = scalars_df[col_name].abs().to_pandas() pd_result = scalars_pandas_df[col_name].abs() - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_fillna(scalars_dfs): @@ -218,7 +215,7 @@ def test_fillna(scalars_dfs): col_name = "string_col" bf_result = scalars_df[col_name].fillna("Missing").to_pandas() pd_result = scalars_pandas_df[col_name].fillna("Missing") - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -465,7 +462,7 @@ def test_series_int_int_operators_scalar( bf_result = maybe_reversed_op(scalars_df["int64_col"], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df["int64_col"], other_scalar) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_series_pow_scalar(scalars_dfs): @@ -474,7 +471,7 @@ def test_series_pow_scalar(scalars_dfs): bf_result = (scalars_df["int64_col"] ** 2).to_pandas() pd_result = scalars_pandas_df["int64_col"] ** 2 - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_series_pow_scalar_reverse(scalars_dfs): @@ -483,7 +480,7 @@ def test_series_pow_scalar_reverse(scalars_dfs): bf_result = (0.8 ** scalars_df["int64_col"]).to_pandas() pd_result = 0.8 ** scalars_pandas_df["int64_col"] - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -509,7 +506,7 @@ def test_series_bool_bool_operators_scalar( bf_result = maybe_reversed_op(scalars_df["bool_col"], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df["bool_col"], other_scalar) - assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) @pytest.mark.parametrize( @@ -547,7 +544,7 @@ def 
test_series_int_int_operators_series(scalars_dfs, operator): scalars_df, scalars_pandas_df = scalars_dfs bf_result = operator(scalars_df["int64_col"], scalars_df["int64_too"]).to_pandas() pd_result = operator(scalars_pandas_df["int64_col"], scalars_pandas_df["int64_too"]) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -697,7 +694,7 @@ def test_series_add_scalar(scalars_dfs, other): bf_result = (scalars_df["float64_col"] + other).to_pandas() pd_result = scalars_pandas_df["float64_col"] + other - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -713,7 +710,7 @@ def test_series_add_bigframes_series(scalars_dfs, left_col, right_col): bf_result = (scalars_df[left_col] + scalars_df[right_col]).to_pandas() pd_result = scalars_pandas_df[left_col] + scalars_pandas_df[right_col] - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -735,7 +732,7 @@ def test_series_add_bigframes_series_nested( scalars_pandas_df[left_col] + scalars_pandas_df[right_col] ) + scalars_pandas_df[righter_col] - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_series_add_different_table_default_index( @@ -893,7 +890,7 @@ def test_isnull(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. - assert_series_equal_ignoring_order(pd_series.astype(pd.BooleanDtype()), bf_series) + assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) def test_notnull(scalars_dfs): @@ -904,7 +901,7 @@ def test_notnull(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. 
- assert_series_equal_ignoring_order(pd_series.astype(pd.BooleanDtype()), bf_series) + assert_series_equal(pd_series.astype(pd.BooleanDtype()), bf_series) def test_round(scalars_dfs): @@ -913,7 +910,7 @@ def test_round(scalars_dfs): bf_result = scalars_df[col_name].round().to_pandas() pd_result = scalars_pandas_df[col_name].round() - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_eq_scalar(scalars_dfs): @@ -922,7 +919,7 @@ def test_eq_scalar(scalars_dfs): bf_result = scalars_df[col_name].eq(0).to_pandas() pd_result = scalars_pandas_df[col_name].eq(0) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_eq_wider_type_scalar(scalars_dfs): @@ -931,7 +928,7 @@ def test_eq_wider_type_scalar(scalars_dfs): bf_result = scalars_df[col_name].eq(1.0).to_pandas() pd_result = scalars_pandas_df[col_name].eq(1.0) - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_ne_scalar(scalars_dfs): @@ -940,7 +937,7 @@ def test_ne_scalar(scalars_dfs): bf_result = (scalars_df[col_name] != 0).to_pandas() pd_result = scalars_pandas_df[col_name] != 0 - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) def test_eq_int_scalar(scalars_dfs): @@ -949,7 +946,7 @@ def test_eq_int_scalar(scalars_dfs): bf_result = (scalars_df[col_name] == 0).to_pandas() pd_result = scalars_pandas_df[col_name] == 0 - assert_series_equal_ignoring_order(pd_result, bf_result) + assert_series_equal(pd_result, bf_result) @pytest.mark.parametrize( @@ -968,7 +965,7 @@ def test_eq_same_type_series(scalars_dfs, col_name): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. - assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) def test_loc_setitem_cell(scalars_df_index, scalars_pandas_df_index): @@ -994,7 +991,7 @@ def test_ne_obj_series(scalars_dfs): # One of dtype mismatches to be documented. Here, the `bf_series.dtype` is `BooleanDtype` but # the `pd_series.dtype` is `bool`. 
- assert_series_equal_ignoring_order(pd_result.astype(pd.BooleanDtype()), bf_result) + assert_series_equal(pd_result.astype(pd.BooleanDtype()), bf_result) def test_indexing_using_unselected_series(scalars_dfs): @@ -1003,7 +1000,7 @@ def test_indexing_using_unselected_series(scalars_dfs): bf_result = scalars_df[col_name][scalars_df["int64_too"].eq(0)].to_pandas() pd_result = scalars_pandas_df[col_name][scalars_pandas_df["int64_too"].eq(0)] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1019,7 +1016,7 @@ def test_indexing_using_selected_series(scalars_dfs): scalars_pandas_df["string_col"].eq("Hello, World!") ] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1041,7 +1038,7 @@ def test_nested_filter(scalars_dfs): ) # Convert from nullable bool to nonnullable bool usable as indexer pd_result = pd_string_col[pd_int64_too == 0][~pd_bool_col] - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1060,7 +1057,7 @@ def test_binop_repeated_application_does_row_identity_joins(scalars_dfs): bf_result = bf_series.to_pandas() pd_result = pd_series - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result, ) @@ -1082,7 +1079,7 @@ def test_binop_opposite_filters(scalars_dfs): pd_bool_col = scalars_pandas_df["bool_col"] pd_result = pd_int64_col1[pd_bool_col] + pd_int64_col2[pd_bool_col.__invert__()] - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result, ) @@ -1100,7 +1097,7 @@ def test_binop_left_filtered(scalars_dfs): pd_bool_col = scalars_pandas_df["bool_col"] pd_result = pd_int64_col[pd_bool_col] + pd_float64_col - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result, ) @@ -1118,7 +1115,7 @@ def test_binop_right_filtered(scalars_dfs): pd_bool_col = scalars_pandas_df["bool_col"] pd_result = pd_float64_col + pd_int64_col[pd_bool_col] - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result, ) @@ -1223,7 +1220,7 @@ def test_groupby_sum(scalars_dfs): ) # TODO(swast): Update groupby to use index based on group by key(s). bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, check_exact=False, @@ -1241,7 +1238,7 @@ def test_groupby_std(scalars_dfs): .astype(pd.Float64Dtype()) ) bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, check_exact=False, @@ -1256,7 +1253,7 @@ def test_groupby_var(scalars_dfs): scalars_pandas_df[col_name].groupby(scalars_pandas_df["string_col"]).var() ) bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, check_exact=False, @@ -1308,7 +1305,7 @@ def test_groupby_mean(scalars_dfs): ) # TODO(swast): Update groupby to use index based on group by key(s). bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, ) @@ -1346,7 +1343,7 @@ def test_groupby_prod(scalars_dfs): ) # TODO(swast): Update groupby to use index based on group by key(s). 
bf_result = bf_series.to_pandas() - assert_series_equal_ignoring_order( + assert_series_equal( pd_series, bf_result, ) @@ -1556,7 +1553,7 @@ def test_head(scalars_dfs): bf_result = scalars_df["string_col"].head(2).to_pandas() pd_result = scalars_pandas_df["string_col"].head(2) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -1571,7 +1568,7 @@ def test_tail(scalars_dfs): bf_result = scalars_df["string_col"].tail(2).to_pandas() pd_result = scalars_pandas_df["string_col"].tail(2) - assert_series_equal_ignoring_order( + assert_series_equal( pd_result, bf_result, ) @@ -2039,7 +2036,7 @@ def test_series_filter_items(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) # Ignore ordering as pandas order differently depending on version - assert_series_equal_ignoring_order( + assert_series_equal( bf_result, pd_result, check_names=False, diff --git a/tests/system/utils.py b/tests/system/utils.py index c68acf10f3..f7831972b8 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -38,13 +38,16 @@ def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): pd.testing.assert_frame_equal(df0, df1, **kwargs) -def assert_series_equal_ignoring_order(left: pd.Series, right: pd.Series, **kwargs): - if left.index.name is None: - left = left.sort_values().reset_index(drop=True) - right = right.sort_values().reset_index(drop=True) - else: - left = left.sort_index() - right = right.sort_index() +def assert_series_equal( + left: pd.Series, right: pd.Series, ignore_order: bool = False, **kwargs +): + if ignore_order: + if left.index.name is None: + left = left.sort_values().reset_index(drop=True) + right = right.sort_values().reset_index(drop=True) + else: + left = left.sort_index() + right = right.sort_index() pd.testing.assert_series_equal(left, right, **kwargs) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index e7026ebd87..f223bd416c 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -117,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr._compile()._to_ibis_expr("unordered") + actual = expr._compile()._to_ibis_expr(ordering_mode="unordered") assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -154,7 +154,7 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ) expr = value.project_binary_op("col2", "col3", ops.add_op, "col4")._compile() assert expr.columns[3].type().is_float64() - actual = expr._to_ibis_expr("unordered") + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 4 assert actual.columns[3] == "col4" @@ -175,7 +175,7 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): "col2", "col3", "col4", ops.where_op, "col5" )._compile() assert expr.columns[4].type().is_float64() - actual = expr._to_ibis_expr("unordered") + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 5 assert actual.columns[4] == "col5" @@ -196,7 +196,7 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): by_column_ids=["col1"], dropna=False, )._compile() - actual = expr._to_ibis_expr("unordered") + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 2 assert actual.columns[0] == "col1" assert actual.columns[1] == "col4" @@ -215,7 +215,7 @@ def 
test_arrayvalue_to_ibis_expr_with_corr_aggregate(): total_ordering_columns=["col1"], ) expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")])._compile() - actual = expr._to_ibis_expr("unordered") + actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 1 assert actual.columns[0] == "col4" assert expr.columns[0].type().is_float64() From 8edabcf09533fca1b2431227ca6707f8de7eecb4 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 28 Oct 2023 01:21:35 +0000 Subject: [PATCH 03/11] fix tests --- bigframes/dataframe.py | 4 +- bigframes/series.py | 4 + tests/system/small/ml/test_cluster.py | 2 +- tests/system/small/ml/test_core.py | 1 + tests/system/small/ml/test_decomposition.py | 2 + tests/system/small/test_dataframe.py | 104 +++++++++++++++----- tests/system/small/test_dataframe_io.py | 6 +- tests/system/small/test_pandas.py | 6 +- tests/system/small/test_series.py | 6 +- 9 files changed, 96 insertions(+), 39 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index db68033c51..bd5cb517b6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -879,8 +879,8 @@ def to_pandas( take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. ordered (bool, default True): - Determines whether the resulting dataframe will be sorted. In some cases, - unordered may result in a faster-executing query. + Determines whether the resulting pandas dataframe will be deterministically ordered. + In some cases, unordered may result in a faster-executing query. Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the diff --git a/bigframes/series.py b/bigframes/series.py index 5b22756d19..ed2868713b 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -286,6 +286,10 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. + ordered (bool, default True): + Determines whether the resulting pandas series will be ordered. In some cases, + unordered may result in a faster-executing query. 
+ Returns: pandas.Series: A pandas Series with all rows of this Series if the data_sampling_threshold_mb diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index a3af71892f..caeffa7768 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -68,7 +68,7 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans): dtype="Int64", index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) - assert_pandas_df_equal(result, expected) + assert_pandas_df_equal(result, expected, ignore_order=True) def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans): diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index cb6507e4e3..ec1f351d87 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -233,6 +233,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame check_index_type=False, check_dtype=False, + ignore_order=True, ) diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index b46b3d103d..cc4d2e5801 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -137,6 +137,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): rtol=0.1, check_index_type=False, check_dtype=False, + ignore_order=True, ) @@ -156,4 +157,5 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): rtol=0.1, check_index_type=False, check_dtype=False, + ignore_order=True, ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 3adda34c40..fe8d7d917a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -113,15 +113,24 @@ def test_hasattr(scalars_dfs): assert not hasattr(scalars_df, "not_exist") -def test_head_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_head_with_custom_column_labels( + scalars_df_index, scalars_pandas_df_index, ordered +): rename_mapping = { "int64_col": "Integer Column", "string_col": "言語列", } bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) - bf_result = bf_df.to_pandas() + bf_result = bf_df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) - pandas.testing.assert_frame_equal(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): @@ -564,14 +573,21 @@ def test_assign_existing_column(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) -def test_assign_series(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_assign_series(scalars_dfs, ordered): scalars_df, scalars_pandas_df = scalars_dfs column_name = "int64_col" df = scalars_df.assign(new_col=scalars_df[column_name]) - bf_result = df.to_pandas() + bf_result = df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) - assert_pandas_df_equal(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) def test_assign_series_overwrite(scalars_dfs): @@ -849,7 +865,9 @@ def 
test_df_merge(scalars_dfs, merge_how):
sort=True,
)
- assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
+ assert_pandas_df_equal(
+ bf_result, pd_result, ignore_order=True, check_index_type=False
+ )
@@ -882,7 +900,9 @@ def test_df_merge_multi_key(scalars_dfs, left_on, right_on):
sort=True,
)
- assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
+ assert_pandas_df_equal(
+ bf_result, pd_result, ignore_order=True, check_index_type=False
+ )
@@ -912,7 +932,9 @@ def test_merge_custom_col_name(scalars_dfs, merge_how):
pandas_right_df = scalars_pandas_df[right_columns]
pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True)
- assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
+ assert_pandas_df_equal(
+ bf_result, pd_result, ignore_order=True, check_index_type=False
+ )
@@ -945,7 +967,9 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how):
sort=True,
)
- assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
+ assert_pandas_df_equal(
+ bf_result, pd_result, ignore_order=True, check_index_type=False
+ )
def test_get_dtypes(scalars_df_default_index):
@@ -1605,8 +1629,15 @@ def test_binop_df_df_binary_op(
# Different table will only work for explicit index, since default index orders are arbitrary.
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
def test_series_binop_add_different_table(
- scalars_df_index, scalars_pandas_df_index, scalars_df_2_index
+ scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered
):
df_columns = ["int64_col", "float64_col"]
series_column = "int64_too"
@@ -1614,13 +1645,13 @@
bf_result = (
scalars_df_index[df_columns]
.add(scalars_df_2_index[series_column], axis="index")
- .to_pandas()
+ .to_pandas(ordered=ordered)
)
pd_result = scalars_pandas_df_index[df_columns].add(
scalars_pandas_df_index[series_column], axis="index"
)
- assert_pandas_df_equal(bf_result, pd_result)
+ assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered)
# TODO(garrettwu): Test series binop with different index
@@ -1899,7 +1930,14 @@ def test_df_describe(scalars_dfs):
).all()
-def test_df_stack(scalars_dfs):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_df_stack(scalars_dfs, ordered):
if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"):
pytest.skip("pandas <2.1 uses different stack implementation")
scalars_df, scalars_pandas_df = scalars_dfs
@@ -1909,14 +1947,23 @@
# Can only stack identically-typed columns
columns = ["int64_col", "int64_too", "rowindex_2"]
- bf_result = scalars_df[columns].stack().to_pandas()
+ bf_result = scalars_df[columns].stack().to_pandas(ordered=ordered)
pd_result = scalars_pandas_df[columns].stack(future_stack=True)
# Pandas produces NaN, where bq dataframes produces pd.NA
- pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+ assert_series_equal(
+ bf_result, pd_result, check_dtype=False, ignore_order=not ordered
+ )
-def test_df_unstack(scalars_dfs):
+@pytest.mark.parametrize(
+ ("ordered"),
+ [
+ (True),
+ (False),
+ ],
+)
+def test_df_unstack(scalars_dfs, ordered):
scalars_df, scalars_pandas_df = scalars_dfs
# To match bigquery dataframes
scalars_pandas_df = scalars_pandas_df.copy()
@@ -1929,11 +1976,13 @@
]
# unstack on mono-index produces series
- bf_result = 
scalars_df[columns].unstack().to_pandas() + bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered) pd_result = scalars_pandas_df[columns].unstack() # Pandas produces NaN, where bq dataframes produces pd.NA - pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) @pytest.mark.parametrize( @@ -2078,14 +2127,18 @@ def test_iloc_slice_zero_step(scalars_df_index): scalars_df_index.iloc[0:0:0] -def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas() +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): + bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) @pytest.mark.parametrize( @@ -2562,6 +2615,7 @@ def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index): assert_pandas_df_equal( bf_result, pd_result, + ignore_order=True, check_names=False, ) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index a235845937..3600dda56d 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -377,7 +377,7 @@ def test_to_sql_query_unnamed_index_included( pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql, index_col=idx_ids) roundtrip.index.names = [None] - assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False) def test_to_sql_query_named_index_included( @@ -409,7 +409,7 @@ def test_to_sql_query_unnamed_index_excluded( pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False) def test_to_sql_query_named_index_excluded( @@ -426,4 +426,4 @@ def test_to_sql_query_named_index_excluded( "rowindex_2", drop=True ).reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) + assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 56bea42ad5..17ba905c9f 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -140,7 +140,7 @@ def test_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -174,7 +174,7 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -208,7 +208,7 @@ def test_merge_series(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) def test_cut(scalars_dfs): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d7578bc985..dc6669c695 100644 --- a/tests/system/small/test_series.py +++ 
b/tests/system/small/test_series.py
@@ -2036,11 +2036,7 @@ def test_series_filter_items(scalars_df_index, scalars_pandas_df_index):
    # Pandas uses int64 instead of Int64 (nullable) dtype.
    pd_result.index = pd_result.index.astype(pd.Int64Dtype())
    # Ignore ordering as pandas orders differently depending on version
-    assert_series_equal(
-        bf_result,
-        pd_result,
-        check_names=False,
-    )
+    assert_series_equal(bf_result, pd_result, check_names=False, ignore_order=True)


def test_series_filter_like(scalars_df_index, scalars_pandas_df_index):

From 449ca103bbde0fdfc85fb603083a1ba2e075fcba Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Sat, 28 Oct 2023 01:21:35 +0000
Subject: [PATCH 04/11] fix tests

---
 tests/system/small/test_dataframe_io.py    |  8 ++++++--
 tests/system/small/test_remote_function.py |  4 +++-
 tests/system/small/test_series.py          | 14 ++++++--------
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 3600dda56d..d700d93be9 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -409,7 +409,9 @@ def test_to_sql_query_unnamed_index_excluded(
    pd_df = scalars_pandas_df_default_index.reset_index(drop=True)
    roundtrip = session.read_gbq(sql)
-    assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False)
+    assert_pandas_df_equal(
+        roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
+    )


def test_to_sql_query_named_index_excluded(
@@ -426,4 +428,6 @@ def test_to_sql_query_named_index_excluded(
        "rowindex_2", drop=True
    ).reset_index(drop=True)
    roundtrip = session.read_gbq(sql)
-    assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False)
+    assert_pandas_df_equal(
+        roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
+    )
diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py
index 6cbe7eee21..3d8532a13b 100644
--- a/tests/system/small/test_remote_function.py
+++ b/tests/system/small/test_remote_function.py
@@ -583,7 +583,9 @@ def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id):
    indirect_df = indirect_df.assign(y=indirect_df.x.apply(square))
    indirect_df = indirect_df.to_pandas()

-    assert_pandas_df_equal(direct_df, indirect_df)
+    assert_pandas_df_equal(
+        direct_df, indirect_df, ignore_order=True, check_index_type=False
+    )


@pytest.mark.flaky(retries=2, delay=120)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index dc6669c695..e0f4416c73 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -1079,10 +1079,9 @@ def test_binop_opposite_filters(scalars_dfs):
    pd_bool_col = scalars_pandas_df["bool_col"]
    pd_result = pd_int64_col1[pd_bool_col] + pd_int64_col2[pd_bool_col.__invert__()]

-    assert_series_equal(
-        bf_result,
-        pd_result,
-    )
+    # Passes with ignore_order=False only with some dependency sets
+    # TODO: Determine desired behavior and make test more strict
+    assert_series_equal(bf_result, pd_result, ignore_order=True)


def test_binop_left_filtered(scalars_dfs):
@@ -1097,10 +1096,9 @@ def test_binop_left_filtered(scalars_dfs):
    pd_bool_col = scalars_pandas_df["bool_col"]
    pd_result = pd_int64_col[pd_bool_col] + pd_float64_col

-    assert_series_equal(
-        bf_result,
-        pd_result,
-    )
+    # Passes with ignore_order=False only with some dependency sets
+    # TODO: Determine desired behavior and make test more strict
+    assert_series_equal(bf_result, pd_result, 
ignore_order=True) def test_binop_right_filtered(scalars_dfs): From 7b382804bd02a2fc48b044d124776931e4838dcb Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 28 Oct 2023 22:45:12 +0000 Subject: [PATCH 05/11] more tests, prettier sql --- bigframes/core/compile/compiled.py | 10 ++++---- bigframes/series.py | 4 +-- tests/system/large/ml/test_cluster.py | 2 +- tests/system/small/test_groupby.py | 35 ++++++++++++++++++++------- 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 0feecae5b6..681a841e1c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -1277,11 +1277,11 @@ def to_sql( ) if sorted: sql = textwrap.dedent( - f""" - SELECT * EXCEPT (`{offsets_id}`) - FROM ({sql}) - ORDER BY `{offsets_id}` - """ + f"SELECT * EXCEPT (`{offsets_id}`)\n" + "FROM (\n" + f"{sql}\n" + ")\n" + f"ORDER BY `{offsets_id}`\n" ) return typing.cast(str, sql) diff --git a/bigframes/series.py b/bigframes/series.py index ed2868713b..52df00ef87 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -287,8 +287,8 @@ def to_pandas( take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. ordered (bool, default True): - Determines whether the resulting pandas series will be ordered. In some cases, - unordered may result in a faster-executing query. + Determines whether the resulting pandas series will be deterministically ordered. + In some cases, unordered may result in a faster-executing query. Returns: diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index cef167d4ac..f01116665f 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -105,7 +105,7 @@ def test_cluster_configure_fit_score_predict( index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) expected.index.name = "observation" - assert_pandas_df_equal(result, expected) + assert_pandas_df_equal(result, expected, ignore_order=True) # save, load, check n_clusters to ensure configuration was kept reloaded_model = model.to_gbq( diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 05154f7ab7..a24713c2b3 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -16,6 +16,7 @@ import pytest import bigframes.pandas as bpd +from tests.system.utils import assert_pandas_df_equal @pytest.mark.parametrize( @@ -88,16 +89,23 @@ def test_dataframe_groupby_aggregate( pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) -def test_dataframe_groupby_agg_string(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_dataframe_groupby_agg_string( + scalars_df_index, scalars_pandas_df_index, ordered +): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = scalars_df_index[col_names].groupby("string_col").agg("count") pd_result = scalars_pandas_df_index[col_names].groupby("string_col").agg("count") - bf_result_computed = bf_result.to_pandas() + bf_result_computed = bf_result.to_pandas(ordered=ordered) - pd.testing.assert_frame_equal( - pd_result, - bf_result_computed, - check_dtype=False, + assert_pandas_df_equal( + pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered ) @@ -270,13 +278,22 @@ def 
test_dataframe_groupby_kurt(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) -def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index, ordered): col_names = ["float64_col", "int64_col", "string_col"] bf_result = scalars_df_index[col_names].groupby("string_col").diff(-1) pd_result = scalars_pandas_df_index[col_names].groupby("string_col").diff(-1) - bf_result_computed = bf_result.to_pandas() + bf_result_computed = bf_result.to_pandas(ordered=ordered) - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + assert_pandas_df_equal( + pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered + ) def test_dataframe_groupby_getitem( From 1ab8e30a3af067c8c8360872dfa5c88134b73baa Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 6 Nov 2023 19:16:02 +0000 Subject: [PATCH 06/11] simplify ir classes --- bigframes/core/__init__.py | 4 +- bigframes/core/compile/__init__.py | 8 +- bigframes/core/compile/compiled.py | 532 ++++++++--------------------- bigframes/core/compile/compiler.py | 23 +- 4 files changed, 173 insertions(+), 394 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index d36a50ff37..1c291a69c2 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -99,7 +99,9 @@ def _compile_unordered(self) -> compiled.UnorderedIR: def shape(self) -> typing.Tuple[int, int]: """Returns dimensions as (length, width) tuple.""" width = len(self._compile().columns) - count_expr = self._compile()._to_ibis_expr(ordering_mode="unordered").count() + count_expr = ( + self._compile_unordered()._to_ibis_expr(ordering_mode="unordered").count() + ) # Support in-memory engines for hermetic unit tests. if not self.node.session: diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index af3f32aefb..761fd9a465 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from bigframes.core.compile.compiled import CompiledArrayValue -from bigframes.core.compile.compiler import compile_ordered +from bigframes.core.compile.compiled import OrderedIR, UnorderedIR +from bigframes.core.compile.compiler import compile_ordered, compile_unordered __all__ = [ "compile_ordered", - "CompiledArrayValue", + "compile_unordered", + "OrderedIR", + "UnorderedIR", ] diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 681a841e1c..6fad1a7645 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -13,6 +13,7 @@ # limitations under the License. 
from __future__ import annotations +import abc import functools import textwrap import typing @@ -41,108 +42,63 @@ ORDER_ID_COLUMN = "bigframes_ordering_id" PREDICATE_COLUMN = "bigframes_predicate" +T = typing.TypeVar("T", bound="BaseIbisIR") + + +class BaseIbisIR(abc.ABC): + """Implementation detail, contains common logic between ordered and unordered IR""" + + def __init__( + self, + table: ibis_types.Table, + columns: Sequence[ibis_types.Value], + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self._table = table + self._predicates = tuple(predicates) if predicates is not None else () + # Allow creating a DataFrame directly from an Ibis table expression. + # TODO(swast): Validate that each column references the same table (or + # no table for literal values). + self._columns = tuple(columns) + # To allow for more efficient lookup by column name, create a + # dictionary mapping names to column values. + self._column_names = {column.get_name(): column for column in self._columns} -class CompiledArrayValue(typing.Protocol): @property - def column_ids(self) -> typing.Sequence[str]: - ... + def columns(self) -> typing.Tuple[ibis_types.Value, ...]: + return self._columns - def to_sql(self) -> str: - ... + @property + def column_ids(self) -> typing.Sequence[str]: + return tuple(self._column_names.keys()) - def _to_ibis_expr(self, *args, **kwargs) -> str: - """Exposed for testing purposes only.""" - ... + @property + def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: + """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" + return ( + _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) + if self._predicates + else None + ) - def select_columns(self, column_ids: typing.Sequence[str]) -> CompiledArrayValue: + @abc.abstractmethod + def select_columns(self: T, column_ids: typing.Sequence[str]) -> T: + """Creates a new expression based on this expression with new columns.""" ... - def drop_columns(self, columns: Iterable[str]) -> CompiledArrayValue: + def drop_columns(self: T, columns: Iterable[str]) -> T: return self.select_columns( [col for col in self.column_ids if col not in columns] ) - def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - ... - - def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue: + @abc.abstractmethod + def filter(self: T, predicate_id: str, keep_null: bool = False) -> T: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" ... - def order_by( - self, by: Sequence[OrderingColumnReference], stable: bool = False - ) -> CompiledArrayValue: - ... - - def reversed(self) -> CompiledArrayValue: - ... - - def project_unary_op( - self, column_name: str, op: ops.UnaryOp, output_name=None - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with unary operation applied to one column.""" - ... - - def project_binary_op( - self, - left_column_id: str, - right_column_id: str, - op: ops.BinaryOp, - output_column_id: str, - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with binary operation applied to two columns.""" - ... - - def project_ternary_op( - self, - col_id_1: str, - col_id_2: str, - col_id_3: str, - op: ops.TernaryOp, - output_column_id: str, - ) -> CompiledArrayValue: - """Creates a new expression based on this expression with ternary operation applied to three columns.""" - ... 
- - def aggregate( - self, - aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], - by_column_ids: typing.Sequence[str] = (), - dropna: bool = True, - ) -> CompiledArrayValue: - """ - Apply aggregations to the expression. - Arguments: - aggregations: input_column_id, operation, output_column_id tuples - by_column_id: column id of the aggregation key, this is preserved through the transform - dropna: whether null keys should be dropped - """ - ... - - def corr_aggregate( - self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] - ) -> CompiledArrayValue: - """ - Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id. - This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. - Arguments: - corr_aggregations: left_column_id, right_column_id, output_column_id tuples - """ - ... - - def assign(self, source_id: str, destination_id: str) -> CompiledArrayValue: - ... - - def assign_constant( - self, - destination_id: str, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> CompiledArrayValue: - ... - + @abc.abstractmethod def unpivot( - self, + self: T, row_labels: typing.Sequence[typing.Hashable], unpivot_columns: typing.Sequence[ typing.Tuple[str, typing.Sequence[typing.Optional[str]]] @@ -154,7 +110,7 @@ def unpivot( bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] ] = pandas.Float64Dtype(), how="left", - ) -> CompiledArrayValue: + ) -> T: """ Unpivot ArrayValue columns. @@ -170,7 +126,8 @@ def unpivot( """ ... - def _reproject_to_table(self) -> CompiledArrayValue: + @abc.abstractmethod + def _reproject_to_table(self: T) -> T: """ Internal operators that projects the internal representation into a new ibis table expression where each value column is a direct @@ -180,78 +137,66 @@ def _reproject_to_table(self) -> CompiledArrayValue: """ ... - def _uniform_sampling(self, fraction: float) -> CompiledArrayValue: - """Sampling the table on given fraction. - - .. warning:: - The row numbers of result is non-deterministic, avoid to use. - """ - ... - - # Always ordered operations - def project_window_op( - self, - column_name: str, - op: agg_ops.WindowOp, - window_spec: WindowSpec, - output_name=None, - *, - never_skip_nulls=False, - skip_reproject_unsafe: bool = False, - ) -> OrderedIR: - """ - Creates a new expression based on this expression with unary operation applied to one column. - column_name: the id of the input column present in the expression - op: the windowable operator to apply to the input column - window_spec: a specification of the window over which to apply the operator - output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided - never_skip_nulls: will disable null skipping for operators that would otherwise do so - skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection - """ - ... - - def promote_offsets(self, col_id: str): - """ - Convenience function to promote copy of column offsets to a value column. Can be used to reset index. - """ - ... 
- + def project_unary_op( + self: T, column_name: str, op: ops.UnaryOp, output_name=None + ) -> T: + """Creates a new expression based on this expression with unary operation applied to one column.""" + value = op._as_ibis(self._get_ibis_column(column_name)).name(output_name) + return self._set_or_replace_by_id(output_name, value) -class BaseIbisIR: - """Implementation detail, contains common logic between ordered and unordered IR""" + def project_binary_op( + self: T, + left_column_id: str, + right_column_id: str, + op: ops.BinaryOp, + output_column_id: str, + ) -> T: + """Creates a new expression based on this expression with binary operation applied to two columns.""" + value = op( + self._get_ibis_column(left_column_id), + self._get_ibis_column(right_column_id), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) - def __init__( - self, - table: ibis_types.Table, - columns: Sequence[ibis_types.Value], - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, - ): - self._table = table - self._predicates = tuple(predicates) if predicates is not None else () - # Allow creating a DataFrame directly from an Ibis table expression. - # TODO(swast): Validate that each column references the same table (or - # no table for literal values). - self._columns = tuple(columns) - # To allow for more efficient lookup by column name, create a - # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} + def project_ternary_op( + self: T, + col_id_1: str, + col_id_2: str, + col_id_3: str, + op: ops.TernaryOp, + output_column_id: str, + ) -> T: + """Creates a new expression based on this expression with ternary operation applied to three columns.""" + value = op( + self._get_ibis_column(col_id_1), + self._get_ibis_column(col_id_2), + self._get_ibis_column(col_id_3), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) - @property - def columns(self) -> typing.Tuple[ibis_types.Value, ...]: - return self._columns + def assign(self: T, source_id: str, destination_id: str) -> T: + return self._set_or_replace_by_id( + destination_id, self._get_ibis_column(source_id) + ) - @property - def column_ids(self) -> typing.Sequence[str]: - return tuple(self._column_names.keys()) + def assign_constant( + self: T, + destination_id: str, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> T: + # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. + ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) + if ibis_value is None: + raise NotImplementedError( + f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" + ) + expr = self._set_or_replace_by_id(destination_id, ibis_value) + return expr._reproject_to_table() - @property - def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: - """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" - return ( - _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) - if self._predicates - else None - ) + @abc.abstractmethod + def _set_or_replace_by_id(self: T, id: str, new_value: ibis_types.Value) -> T: + ... 
def _get_ibis_column(self, key: str) -> ibis_types.Value: """Gets the Ibis expression for a given column.""" @@ -272,7 +217,7 @@ def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: # Ibis Implementations -class UnorderedIR(BaseIbisIR, CompiledArrayValue): +class UnorderedIR(BaseIbisIR): def __init__( self, table: ibis_types.Table, @@ -281,58 +226,6 @@ def __init__( ): super().__init__(table, columns, predicates) - @classmethod - def from_pandas( - cls, - pd_df: pandas.DataFrame, - ) -> UnorderedIR: - """ - Builds an in-memory only (SQL only) expr from a pandas dataframe. - """ - # We can't include any hidden columns in the ArrayValue constructor, so - # grab the column names before we add the hidden ordering column. - column_names = [str(column) for column in pd_df.columns] - # Make sure column names are all strings. - pd_df = pd_df.set_axis(column_names, axis="columns") - - # ibis memtable cannot handle NA, must convert to None - pd_df = pd_df.astype("object") # type: ignore - pd_df = pd_df.where(pandas.notnull(pd_df), None) - - # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases. - keys_memtable = ibis.memtable(pd_df) - schema = keys_memtable.schema() - new_schema = [] - for column_index, column in enumerate(schema): - column_type = schema[column] - # The autodetected type might not be one we can support, such - # as NULL type for empty rows, so convert to a type we do - # support. - new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) - ) - # TODO(swast): Ibis memtable doesn't use backticks in struct - # field names, so spaces and other characters aren't allowed in - # the memtable context. Blocked by - # https://github.com/ibis-project/ibis/issues/7187 - column = f"col_{column_index}" - new_schema.append((column, new_type)) - - # must set non-null column labels. 
these are not the user-facing labels - pd_df = pd_df.set_axis( - [column for column, _ in new_schema], - axis="columns", - ) - keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema)) - - return cls( - keys_memtable, - columns=[ - keys_memtable[f"col_{column_index}"].name(column) - for column_index, column in enumerate(column_names) - ], - ) - def builder(self): """Creates a mutable builder for expressions.""" # Since ArrayValue is intended to be immutable (immutability offers @@ -432,7 +325,7 @@ def select_columns(self, column_ids: typing.Sequence[str]) -> UnorderedIR: new_expr = builder.build() return new_expr - def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue: + def filter(self, predicate_id: str, keep_null: bool = False) -> UnorderedIR: condition = typing.cast( ibis_types.BooleanValue, self._get_ibis_column(predicate_id) ) @@ -445,76 +338,12 @@ def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayVal ) return self._filter(condition) - def _filter(self, predicate_value: ibis_types.BooleanValue) -> CompiledArrayValue: + def _filter(self, predicate_value: ibis_types.BooleanValue) -> UnorderedIR: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" expr = self.builder() expr.predicates = [*self._predicates, predicate_value] return expr.build() - def order_by( - self, by: Sequence[OrderingColumnReference], stable: bool = False - ) -> UnorderedIR: - return self - - def reversed(self) -> UnorderedIR: - return self - - def project_unary_op( - self, column_name: str, op: ops.UnaryOp, output_name=None - ) -> UnorderedIR: - value = op._as_ibis(self._get_ibis_column(column_name)).name( - output_name or column_name - ) - return self._set_or_replace_by_id(output_name or column_name, value) - - def project_binary_op( - self, - left_column_id: str, - right_column_id: str, - op: ops.BinaryOp, - output_column_id: str, - ) -> UnorderedIR: - value = op( - self._get_ibis_column(left_column_id), - self._get_ibis_column(right_column_id), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - - def project_ternary_op( - self, - col_id_1: str, - col_id_2: str, - col_id_3: str, - op: ops.TernaryOp, - output_column_id: str, - ) -> UnorderedIR: - value = op( - self._get_ibis_column(col_id_1), - self._get_ibis_column(col_id_2), - self._get_ibis_column(col_id_3), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - - def assign(self, source_id: str, destination_id: str) -> UnorderedIR: - return self._set_or_replace_by_id( - destination_id, self._get_ibis_column(source_id) - ) - - def assign_constant( - self, - destination_id: str, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> UnorderedIR: - # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. - ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) - if ibis_value is None: - raise NotImplementedError( - f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" - ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() - def unpivot( self, row_labels: typing.Sequence[typing.Hashable], @@ -628,6 +457,13 @@ def aggregate( by_column_ids: typing.Sequence[str] = (), dropna: bool = True, ) -> OrderedIR: + """ + Apply aggregations to the expression. 
+
+        Arguments:
+            aggregations: input_column_id, operation, output_column_id tuples
+            by_column_id: column id of the aggregation key, this is preserved through the transform
+            dropna: whether null keys should be dropped
+        """
        table = self._to_ibis_expr()
        stats = {
            col_out: agg_op._as_ibis(table[col_in])
@@ -675,6 +511,12 @@ def aggregate(
    def corr_aggregate(
        self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]]
    ) -> OrderedIR:
+        """
+        Get correlations between each left_column_id and right_column_id, stored in the respective output_column_id.
+        This uses BigQuery's CORR under the hood, and thus only Pearson's method is used.
+        Arguments:
+            corr_aggregations: left_column_id, right_column_id, output_column_id tuples
+        """
        table = self._to_ibis_expr()
        stats = {
            col_out: table[col_left].corr(table[col_right], how="pop")
@@ -708,27 +550,10 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
            columns=columns,
        )

-    # Unsupported operations, need ordering
-    def project_window_op(
-        self,
-        column_name: str,
-        op: agg_ops.WindowOp,
-        window_spec: WindowSpec,
-        output_name=None,
-        *,
-        never_skip_nulls=False,
-        skip_reproject_unsafe: bool = False,
-    ) -> OrderedIR:
-        raise ValueError("Window ops must be compiled in ordered mode")
-
-    def promote_offsets(self, col_id: str):
-        raise ValueError("Window ops must be compiled in ordered mode")
-
    ## Helpers
    def _set_or_replace_by_id(
        self, id: str, new_value: ibis_types.Value
    ) -> UnorderedIR:
-        """Safely assign by id while maintaining ordering integrity."""
        builder = self.builder()
        if id in self.column_ids:
            builder.columns = [
@@ -773,7 +598,7 @@ def build(self) -> UnorderedIR:
        )


-class OrderedIR(BaseIbisIR, CompiledArrayValue):
+class OrderedIR(BaseIbisIR):
    """Immutable BigQuery DataFrames expression tree.

    Note: Usage of this class is considered to be private and subject to change
@@ -935,62 +760,6 @@ def reversed(self) -> OrderedIR:
        expr_builder.ordering = self._ordering.with_reverse()
        return expr_builder.build()

-    def project_unary_op(
-        self, column_name: str, op: ops.UnaryOp, output_name=None
-    ) -> OrderedIR:
-        value = op._as_ibis(self._get_ibis_column(column_name)).name(
-            output_name or column_name
-        )
-        return self._set_or_replace_by_id(output_name or column_name, value)
-
-    def project_binary_op(
-        self,
-        left_column_id: str,
-        right_column_id: str,
-        op: ops.BinaryOp,
-        output_column_id: str,
-    ) -> OrderedIR:
-        value = op(
-            self._get_ibis_column(left_column_id),
-            self._get_ibis_column(right_column_id),
-        ).name(output_column_id)
-        return self._set_or_replace_by_id(output_column_id, value)
-
-    def project_ternary_op(
-        self,
-        col_id_1: str,
-        col_id_2: str,
-        col_id_3: str,
-        op: ops.TernaryOp,
-        output_column_id: str,
-    ) -> OrderedIR:
-        value = op(
-            self._get_ibis_column(col_id_1),
-            self._get_ibis_column(col_id_2),
-            self._get_ibis_column(col_id_3),
-        ).name(output_column_id)
-        return self._set_or_replace_by_id(output_column_id, value)
-
-    def assign(self, source_id: str, destination_id: str) -> OrderedIR:
-        return self._set_or_replace_by_id(
-            destination_id, self._get_ibis_column(source_id)
-        )
-
-    def assign_constant(
-        self,
-        destination_id: str,
-        value: typing.Any,
-        dtype: typing.Optional[bigframes.dtypes.Dtype],
-    ) -> OrderedIR:
-        # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis.
-        ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype)
-        if ibis_value is None:
-            raise NotImplementedError(
-                f"Type not supported as scalar value {type(value)}. 
{constants.FEEDBACK_LINK}" - ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() - def _uniform_sampling(self, fraction: float) -> OrderedIR: """Sampling the table on given fraction. @@ -1012,6 +781,9 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR: ) def promote_offsets(self, col_id: str) -> OrderedIR: + """ + Convenience function to promote copy of column offsets to a value column. Can be used to reset index. + """ # Special case: offsets already exist ordering = self._ordering @@ -1038,19 +810,6 @@ def select_columns(self, column_ids: typing.Sequence[str]) -> OrderedIR: new_expr = builder.build() return new_expr - def aggregate( - self, - aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], - by_column_ids: typing.Sequence[str] = (), - dropna: bool = True, - ) -> OrderedIR: - return self.to_unordered().aggregate(aggregations, by_column_ids, dropna) - - def corr_aggregate( - self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] - ) -> OrderedIR: - return self.to_unordered().corr_aggregate(corr_aggregations) - ## Methods that only work with ordering def project_window_op( self, @@ -1062,6 +821,15 @@ def project_window_op( never_skip_nulls=False, skip_reproject_unsafe: bool = False, ) -> OrderedIR: + """ + Creates a new expression based on this expression with unary operation applied to one column. + column_name: the id of the input column present in the expression + op: the windowable operator to apply to the input column + window_spec: a specification of the window over which to apply the operator + output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided + never_skip_nulls: will disable null skipping for operators that would otherwise do so + skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection + """ column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) @@ -1373,26 +1141,7 @@ def _to_ibis_expr( table = table.filter(ibis.random() < ibis.literal(fraction)) return table - def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> OrderedIR: - """Safely assign by id while maintaining ordering integrity.""" - # TODO: Split into explicit set and replace methods - ordering_col_ids = [ - col_ref.column_id for col_ref in self._ordering.ordering_value_columns - ] - if id in ordering_col_ids: - return self._hide_column(id)._set_or_replace_by_id(id, new_value) - - builder = self.builder() - if id in self.column_ids: - builder.columns = [ - val if (col_id != id) else new_value.name(id) - for col_id, val in zip(self.column_ids, self._columns) - ] - else: - builder.columns = [*self.columns, new_value.name(id)] - return builder.build() - - def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue: + def filter(self, predicate_id: str, keep_null: bool = False) -> OrderedIR: condition = typing.cast( ibis_types.BooleanValue, self._get_ibis_column(predicate_id) ) @@ -1412,6 +1161,25 @@ def _filter(self, predicate_value: ibis_types.BooleanValue) -> OrderedIR: expr.predicates = [*self._predicates, predicate_value] return expr.build() + def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> 
OrderedIR: + """Safely assign by id while maintaining ordering integrity.""" + # TODO: Split into explicit set and replace methods + ordering_col_ids = [ + col_ref.column_id for col_ref in self._ordering.ordering_value_columns + ] + if id in ordering_col_ids: + return self._hide_column(id)._set_or_replace_by_id(id, new_value) + + builder = self.builder() + if id in self.column_ids: + builder.columns = [ + val if (col_id != id) else new_value.name(id) + for col_id, val in zip(self.column_ids, self._columns) + ] + else: + builder.columns = [*self.columns, new_value.name(id)] + return builder.build() + ## Ordering specific helpers def _get_any_column(self, key: str) -> ibis_types.Value: """Gets the Ibis expression for a given column. Will also get hidden columns.""" diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 5959695b30..7202042869 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -92,10 +92,11 @@ def compile_drop(node: nodes.DropColumnsNode, ordered: bool = True): @_compile_node.register def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) + ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd) if ordered: - return compiled.OrderedIR.from_pandas(array_as_pd) + return ordered_ir else: - return compiled.UnorderedIR.from_pandas(array_as_pd) + ordered_ir.to_unordered() @_compile_node.register @@ -116,7 +117,7 @@ def compile_readgbq(node: nodes.ReadGbqNode, ordered: bool = True): @_compile_node.register def compile_promote_offsets(node: nodes.PromoteOffsetsNode, ordered: bool = True): - result = compile_node(node.child, True).promote_offsets(node.col_id) + result = compile_ordered(node.child).promote_offsets(node.col_id) return result if ordered else result.to_unordered() @@ -127,12 +128,18 @@ def compile_filter(node: nodes.FilterNode, ordered: bool = True): @_compile_node.register def compile_orderby(node: nodes.OrderByNode, ordered: bool = True): - return compile_node(node.child, ordered).order_by(node.by, node.stable) + if ordered: + return compile_ordered(node.child).order_by(node.by, node.stable) + else: + return compile_unordered(node.child) @_compile_node.register def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): - return compile_node(node.child, ordered).reversed() + if ordered: + return compile_ordered(node.child).reversed() + else: + return compile_unordered(node.child) @_compile_node.register @@ -168,7 +175,7 @@ def compile_concat(node: nodes.ConcatNode, ordered: bool = True): @_compile_node.register def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): - result = compile_node(node.child, False).aggregate( + result = compile_unordered(node.child).aggregate( node.aggregations, node.by_column_ids, node.dropna ) return result if ordered else result.to_unordered() @@ -176,13 +183,13 @@ def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): @_compile_node.register def compile_corr(node: nodes.CorrNode, ordered: bool = True): - result = compile_node(node.child, False).corr_aggregate(node.corr_aggregations) + result = compile_unordered(node.child).corr_aggregate(node.corr_aggregations) return result if ordered else result.to_unordered() @_compile_node.register def compile_window(node: nodes.WindowOpNode, ordered: bool = True): - result = compile_node(node.child, True).project_window_op( + result = compile_ordered(node.child).project_window_op( node.column_name, 
node.op, node.window_spec, From 57f8bc23570ea190cda6a4673d1c191dc67d3f52 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 7 Nov 2023 01:42:38 +0000 Subject: [PATCH 07/11] fix missing return statement --- bigframes/core/compile/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 7202042869..feba392305 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -96,7 +96,7 @@ def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): if ordered: return ordered_ir else: - ordered_ir.to_unordered() + return ordered_ir.to_unordered() @_compile_node.register From 43430fc3920187c1459480430247d5b6627fd3cd Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 7 Nov 2023 01:57:27 +0000 Subject: [PATCH 08/11] fix unary op bug --- bigframes/core/__init__.py | 4 +--- bigframes/core/compile/compiled.py | 14 +++++++++----- bigframes/core/compile/compiler.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 1c291a69c2..931174cb56 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -99,9 +99,7 @@ def _compile_unordered(self) -> compiled.UnorderedIR: def shape(self) -> typing.Tuple[int, int]: """Returns dimensions as (length, width) tuple.""" width = len(self._compile().columns) - count_expr = ( - self._compile_unordered()._to_ibis_expr(ordering_mode="unordered").count() - ) + count_expr = self._compile_unordered()._to_ibis_expr().count() # Support in-memory engines for hermetic unit tests. if not self.node.session: diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 6fad1a7645..4ba5e6bd08 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -138,11 +138,17 @@ def _reproject_to_table(self: T) -> T: ... def project_unary_op( - self: T, column_name: str, op: ops.UnaryOp, output_name=None + self: T, + input_column_id: str, + op: ops.UnaryOp, + output_column_id: typing.Optional[str] = None, ) -> T: """Creates a new expression based on this expression with unary operation applied to one column.""" - value = op._as_ibis(self._get_ibis_column(column_name)).name(output_name) - return self._set_or_replace_by_id(output_name, value) + result_id = ( + output_column_id or input_column_id + ) # overwrite input if not output id provided + value = op._as_ibis(self._get_ibis_column(input_column_id)).name(result_id) + return self._set_or_replace_by_id(result_id, value) def project_binary_op( self: T, @@ -258,7 +264,6 @@ def _to_ibis_expr( expose_hidden_cols: bool = False, fraction: Optional[float] = None, col_id_overrides: typing.Mapping[str, str] = {}, - **kwargs, ): """ Creates an Ibis table expression representing the DataFrame. @@ -1061,7 +1066,6 @@ def _to_ibis_expr( col_id_overrides: typing.Mapping[str, str] = {}, ordering_mode: Literal["string_encoded", "offset_col", "unordered"], order_col_name: Optional[str] = ORDER_ID_COLUMN, - **kwargs, ): """ Creates an Ibis table expression representing the DataFrame. 
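Note on the unary-op fix above: with the "output_column_id or input_column_id" fallback restored, omitting the output id overwrites the input column in place, while passing a distinct id appends a new column. A minimal sketch of the two call shapes, reusing the AsTypeOp exercised later in this series' unit tests; here "ir" stands for any already-compiled IR and the column ids are hypothetical, so this is illustrative rather than code from the patch:

    import bigframes.operations as ops

    # No output id: result_id falls back to "col1", which is replaced in
    # place, so the set of column ids is unchanged.
    replaced = ir.project_unary_op("col1", ops.AsTypeOp("string"))
    assert replaced.column_ids == ir.column_ids

    # Distinct output id: "col1" survives and "col1_str" is appended.
    appended = ir.project_unary_op("col1", ops.AsTypeOp("string"), "col1_str")
    assert "col1_str" in appended.column_ids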
diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index feba392305..662e73a433 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -50,7 +50,7 @@ def _compile_node( node: nodes.BigFrameNode, ordered: bool = True ) -> compiled.UnorderedIR: """Defines transformation but isn't cached, always use compile_node instead""" - raise ValueError(f"Can't compile unnrecognized node: {node}") + raise ValueError(f"Can't compile unrecognized node: {node}") @_compile_node.register From 5fc62fce8f764e5d6ad93ac3b1df68bba6df0d40 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 7 Nov 2023 02:40:34 +0000 Subject: [PATCH 09/11] fix flaky test, add more unordered tests --- tests/system/large/ml/test_pipeline.py | 2 +- tests/system/small/test_dataframe.py | 15 ++++++++++++--- tests/system/small/test_pandas.py | 13 ++++++++++--- tests/system/small/test_series.py | 16 ++++++++++------ 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 3197320047..3e56954058 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -555,7 +555,7 @@ def test_pipeline_standard_scaler_kmeans_fit_score_predict( ), ) expected.index.name = "observation" - assert_pandas_df_equal(result, expected) + assert_pandas_df_equal(result, expected, ignore_order=True) def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_index): diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 269f1bfc65..e5dae4b250 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2560,16 +2560,25 @@ def test_df_skew_too_few_values(scalars_dfs): pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) -def test_df_skew(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_skew(scalars_dfs, ordered): columns = ["float64_col", "int64_col"] scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].skew().to_pandas() + bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered) pd_result = scalars_pandas_df[columns].skew() # Pandas may produce narrower numeric types, but bigframes always produces Float64 pd_result = pd_result.astype("Float64") - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + assert_series_equal( + pd_result, bf_result, check_index_type=False, ignore_order=not ordered + ) def test_df_kurt_too_few_values(scalars_dfs): diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index eafe5e00b5..8795a67e2a 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -19,13 +19,20 @@ from tests.system.utils import assert_pandas_df_equal -def test_concat_dataframe(scalars_dfs): +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_concat_dataframe(scalars_dfs, ordered): scalars_df, scalars_pandas_df = scalars_dfs bf_result = bpd.concat(11 * [scalars_df]) - bf_result = bf_result.to_pandas() + bf_result = bf_result.to_pandas(ordered=ordered) pd_result = pd.concat(11 * [scalars_pandas_df]) - pd.testing.assert_frame_equal(bf_result, pd_result) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) def test_concat_series(scalars_dfs): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 
96293fb7bb..66d1f02a45 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -2188,21 +2188,25 @@ def test_where_with_default(scalars_df_index, scalars_pandas_df_index):
    )


-def test_clip(scalars_df_index, scalars_pandas_df_index):
+@pytest.mark.parametrize(
+    ("ordered"),
+    [
+        (True),
+        (False),
+    ],
+)
+def test_clip(scalars_df_index, scalars_pandas_df_index, ordered):
    col_bf = scalars_df_index["int64_col"]
    lower_bf = scalars_df_index["int64_too"] - 1
    upper_bf = scalars_df_index["int64_too"] + 1
-    bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas()
+    bf_result = col_bf.clip(lower_bf, upper_bf).to_pandas(ordered=ordered)

    col_pd = scalars_pandas_df_index["int64_col"]
    lower_pd = scalars_pandas_df_index["int64_too"] - 1
    upper_pd = scalars_pandas_df_index["int64_too"] + 1
    pd_result = col_pd.clip(lower_pd, upper_pd)

-    pd.testing.assert_series_equal(
-        bf_result,
-        pd_result,
-    )
+    assert_series_equal(bf_result, pd_result, ignore_order=not ordered)


def test_clip_filtered_two_sided(scalars_df_index, scalars_pandas_df_index):

From 8b7afb5d60747d4c53a6faaee5247c0b0e1e3be0 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Wed, 8 Nov 2023 22:46:55 +0000
Subject: [PATCH 10/11] rename _compile to _compile_ordered for arrayvalue

---
 bigframes/core/__init__.py | 12 ++++++------
 tests/unit/test_core.py    | 24 ++++++++++++++----------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
index dd5ee5351b..63f36d4ddd 100644
--- a/bigframes/core/__init__.py
+++ b/bigframes/core/__init__.py
@@ -79,7 +79,7 @@ def from_pandas(cls, pd_df: pandas.DataFrame):

    @property
    def column_ids(self) -> typing.Sequence[str]:
-        return self._compile().column_ids
+        return self._compile_ordered().column_ids

    @property
    def session(self) -> Session:
@@ -89,9 +89,9 @@ def session(self) -> Session:
        return self.node.session[0] if required_session else get_global_session()

    def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
-        return self._compile().get_column_type(key)
+        return self._compile_ordered().get_column_type(key)

-    def _compile(self) -> compiled.OrderedIR:
+    def _compile_ordered(self) -> compiled.OrderedIR:
        return compiler.compile_ordered(self.node)

    def _compile_unordered(self) -> compiled.UnorderedIR:
@@ -99,7 +99,7 @@ def _compile_unordered(self) -> compiled.UnorderedIR:

    def shape(self) -> typing.Tuple[int, int]:
        """Returns dimensions as (length, width) tuple."""
-        width = len(self._compile().columns)
+        width = len(self._compile_unordered().columns)
        count_expr = self._compile_unordered()._to_ibis_expr().count()

        # Support in-memory engines for hermetic unit tests.
@@ -126,7 +126,7 @@ def to_sql( sorted: bool = False, ) -> str: if sorted or offset_column: - return self._compile().to_sql( + return self._compile_ordered().to_sql( offset_column=offset_column, col_id_overrides=col_id_overrides, sorted=sorted, @@ -161,7 +161,7 @@ def start_query( def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: """Write the ArrayValue to a session table and create a new block object that references it.""" - compiled_value = self._compile() + compiled_value = self._compile_ordered() ibis_expr = compiled_value._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True ) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index f223bd416c..623448b3aa 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -49,7 +49,7 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): ordering=ordering, hidden_ordering_columns=(), ) - assert actual._compile()._table is ibis_table + assert actual._compile_ordered()._table is ibis_table assert len(actual.column_ids) == 3 @@ -83,7 +83,7 @@ def test_arrayvalue_with_get_column(): ), total_ordering_columns=["col1"], ) - col1 = value._compile()._get_ibis_column("col1") + col1 = value._compile_ordered()._get_ibis_column("col1") assert isinstance(col1, ibis_types.Value) assert col1.get_name() == "col1" assert col1.type().is_int64() @@ -100,7 +100,7 @@ def test_arrayvalues_to_ibis_expr_with_get_column(): ), total_ordering_columns=["col1"], ) - expr = value._compile()._get_ibis_column("col1") + expr = value._compile_ordered()._get_ibis_column("col1") assert expr.get_name() == "col1" assert expr.type().is_int64() @@ -117,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr._compile()._to_ibis_expr(ordering_mode="unordered") + actual = expr._compile_ordered()._to_ibis_expr(ordering_mode="unordered") assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -136,8 +136,8 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_unary_op("col1", ops.AsTypeOp("string"))._compile() - assert value._compile().columns[0].type().is_int64() + expr = value.project_unary_op("col1", ops.AsTypeOp("string"))._compile_ordered() + assert value._compile_ordered().columns[0].type().is_int64() assert expr.columns[0].type().is_string() @@ -152,7 +152,9 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_binary_op("col2", "col3", ops.add_op, "col4")._compile() + expr = value.project_binary_op( + "col2", "col3", ops.add_op, "col4" + )._compile_ordered() assert expr.columns[3].type().is_float64() actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 4 @@ -173,7 +175,7 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ) expr = value.project_ternary_op( "col2", "col3", "col4", ops.where_op, "col5" - )._compile() + )._compile_ordered() assert expr.columns[4].type().is_float64() actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 5 @@ -195,7 +197,7 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): aggregations=(("col1", agg_ops.sum_op, "col4"),), by_column_ids=["col1"], dropna=False, - )._compile() + )._compile_ordered() actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 2 assert actual.columns[0] == "col1" 
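These unit tests pin down the ordered path through _compile_ordered. The unordered path can be exercised the same way; a sketch of an equivalent assertion, assuming a value built like the ArrayValues above (this sketch is not part of the patch):

    expr = value._compile_unordered()
    # UnorderedIR._to_ibis_expr takes no ordering_mode and carries no hidden
    # ordering columns, so the Ibis table's columns match the IR's one-to-one.
    actual = expr._to_ibis_expr()
    assert len(actual.columns) == len(expr.columns)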
@@ -214,7 +216,9 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): ), total_ordering_columns=["col1"], ) - expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")])._compile() + expr = value.corr_aggregate( + corr_aggregations=[("col1", "col3", "col4")] + )._compile_ordered() actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 1 assert actual.columns[0] == "col4" From 146b74cec1583c12e0761d478bfb882900c117f5 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 8 Nov 2023 23:05:42 +0000 Subject: [PATCH 11/11] fix mypy issue on join how arg --- bigframes/core/compile/single_column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index c193624e6d..a9088feb49 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -212,7 +212,7 @@ def join_by_column_unordered( left_table, right_table, predicates=join_conditions, - how=how, + how=how, # type: ignore ) # We could filter out the original join columns, but predicates/ordering # might still reference them in implicit joins.
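Taken end to end, the series lets callers trade deterministic row order for cheaper queries: to_pandas(ordered=False) routes through _compile_unordered and UnorderedIR, so the generated SQL skips the hidden ordering column and the final ORDER BY wrapper. A hedged usage sketch of that public surface; the table name is a placeholder, and the performance note depends on the query:

    import bigframes.pandas as bpd

    df = bpd.read_gbq("my-project.my_dataset.my_table")  # placeholder table

    # Default ordered=True: compiled via OrderedIR; when sorted output is
    # requested, to_sql wraps the query in
    # SELECT * EXCEPT (ordering id) ... ORDER BY ordering id.
    ordered_frame = df.to_pandas()

    # ordered=False: compiled via UnorderedIR; row order is arbitrary, but
    # the query avoids the ordering bookkeeping entirely.
    unordered_frame = df.to_pandas(ordered=False)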