From 8b7b26b9054bfc9c0c47d7886d88640837272c88 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 3 Oct 2023 00:44:51 +0000 Subject: [PATCH 1/6] refactor: simplify ArrayValue public interface --- bigframes/core/__init__.py | 218 ++++++++++++-------------- bigframes/core/blocks.py | 28 +--- bigframes/core/groupby/__init__.py | 4 - bigframes/core/indexes/index.py | 4 +- bigframes/core/joins/row_identity.py | 16 +- bigframes/core/joins/single_column.py | 20 +-- bigframes/dataframe.py | 6 +- bigframes/operations/base.py | 6 - tests/unit/test_core.py | 32 +--- 9 files changed, 130 insertions(+), 204 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 5e0675fd13..d94b60c866 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -16,8 +16,9 @@ from dataclasses import dataclass import functools import math +import textwrap import typing -from typing import Collection, Dict, Iterable, Literal, Optional, Sequence, Tuple +from typing import Collection, Iterable, Literal, Optional, Sequence, Tuple from google.cloud import bigquery import ibis @@ -201,31 +202,27 @@ def mem_expr_from_pandas( hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), ) - @property - def table(self) -> ibis_types.Table: - return self._table - - @property - def reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: - """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" - return ( - _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) - if self._predicates - else None - ) - @property def columns(self) -> typing.Tuple[ibis_types.Value, ...]: return self._columns @property - def column_names(self) -> Dict[str, ibis_types.Value]: - return self._column_names + def column_ids(self) -> typing.Sequence[str]: + return tuple(self._column_names.keys()) @property def hidden_ordering_columns(self) -> typing.Tuple[ibis_types.Value, ...]: return self._hidden_ordering_columns + @property + def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: + """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" + return ( + _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) + if self._predicates + else None + ) + @property def _ibis_order(self) -> Sequence[ibis_types.Value]: """Returns a sequence of ibis values which can be directly used to order a table expression. 
Has direction modifiers applied."""
@@ -265,24 +262,22 @@ def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
 
     def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
         ibis_type = typing.cast(
-            bigframes.dtypes.IbisDtype, self.get_any_column(key).type()
+            bigframes.dtypes.IbisDtype, self._get_any_column(key).type()
         )
         return typing.cast(
             bigframes.dtypes.Dtype,
             bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type),
         )
 
-    def get_column(self, key: str) -> ibis_types.Value:
+    def _get_ibis_column(self, key: str) -> ibis_types.Value:
         """Gets the Ibis expression for a given column."""
-        if key not in self._column_names.keys():
+        if key not in self.column_ids:
             raise ValueError(
-                "Column name {} not in set of values: {}".format(
-                    key, self._column_names.keys()
-                )
+                "Column name {} not in set of values: {}".format(key, self.column_ids)
             )
         return typing.cast(ibis_types.Value, self._column_names[key])
 
-    def get_any_column(self, key: str) -> ibis_types.Value:
+    def _get_any_column(self, key: str) -> ibis_types.Value:
         """Gets the Ibis expression for a given column. Will also get hidden columns."""
         all_columns = {**self._column_names, **self._hidden_ordering_column_names}
         if key not in all_columns.keys():
@@ -303,26 +298,11 @@ def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column:
         )
         return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key])
 
-    def apply_limit(self, max_results: int) -> ArrayValue:
-        table = self._to_ibis_expr(
-            ordering_mode="order_by",
-            expose_hidden_cols=True,
-        ).limit(max_results)
-        columns = [table[column_name] for column_name in self._column_names]
-        hidden_ordering_columns = [
-            table[column_name] for column_name in self._hidden_ordering_column_names
-        ]
-        return ArrayValue(
-            self._session,
-            table,
-            columns=columns,
-            hidden_ordering_columns=hidden_ordering_columns,
-            ordering=self._ordering,
-        )
-
     def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue:
         """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression."""
-        condition = typing.cast(ibis_types.BooleanValue, self.get_column(predicate_id))
+        condition = typing.cast(
+            ibis_types.BooleanValue, self._get_ibis_column(predicate_id)
+        )
         if keep_null:
             condition = typing.cast(
                 ibis_types.BooleanValue,
@@ -357,9 +337,7 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue:
 
         .. warning::
             The row numbers of the result are non-deterministic; avoid relying on them.
         """
-        table = self._to_ibis_expr(
-            ordering_mode="order_by", expose_hidden_cols=True, fraction=fraction
-        )
+        table = self._to_ibis_expr(expose_hidden_cols=True, fraction=fraction)
         columns = [table[column_name] for column_name in self._column_names]
         hidden_ordering_columns = [
             table[column_name] for column_name in self._hidden_ordering_column_names
@@ -373,7 +351,7 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue:
         )
 
     @property
-    def offsets(self):
+    def _offsets(self) -> ibis_types.Value:
         if not self._ordering.is_sequential:
             raise ValueError(
                 "Expression does not have offsets. Generate them first using project_offsets."
             )
@@ -382,9 +360,9 @@
             raise ValueError(
                 "Ordering is invalid. Marked as sequential but no total order columns."
             )
-        return self.get_any_column(self._ordering.total_order_col.column_id)
+        return self._get_any_column(self._ordering.total_order_col.column_id)
 
-    def project_offsets(self) -> ArrayValue:
+    def _project_offsets(self) -> ArrayValue:
         """Create a new expression that contains offsets.
         Should only be executed when offsets are needed for an operation. Has no effect on expression semantics."""
         if self._ordering.is_sequential:
             return self
@@ -414,7 +392,7 @@ def _hide_column(self, column_id) -> ArrayValue:
         new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_")
         expr_builder.hidden_ordering_columns = [
             *self._hidden_ordering_columns,
-            self.get_column(column_id).name(new_name),
+            self._get_ibis_column(column_id).name(new_name),
         ]
         expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name})
         return expr_builder.build()
@@ -427,26 +405,28 @@ def promote_offsets(self) -> typing.Tuple[ArrayValue, str]:
         ordering = self._ordering
 
         if (not ordering.is_sequential) or (not ordering.total_order_col):
-            return self.project_offsets().promote_offsets()
+            return self._project_offsets().promote_offsets()
         col_id = bigframes.core.guid.generate_guid()
         expr_builder = self.builder()
         expr_builder.columns = [
-            self.get_any_column(ordering.total_order_col.column_id).name(col_id),
+            self._get_any_column(ordering.total_order_col.column_id).name(col_id),
             *self.columns,
         ]
         return expr_builder.build(), col_id
 
     def select_columns(self, column_ids: typing.Sequence[str]):
-        return self.projection([self.get_column(col_id) for col_id in column_ids])
+        return self._projection(
+            [self._get_ibis_column(col_id) for col_id in column_ids]
+        )
 
-    def projection(self, columns: Iterable[ibis_types.Value]) -> ArrayValue:
+    def _projection(self, columns: Iterable[ibis_types.Value]) -> ArrayValue:
         """Creates a new expression based on this expression with new columns."""
         # TODO(swast): We might want to do validation here that columns derive
         # from the same table expression instead of (in addition to?) at
         # construction time.
         expr = self
-        for ordering_column in set(self.column_names.keys()).intersection(
+        for ordering_column in set(self.column_ids).intersection(
             [col_ref.column_id for col_ref in self._ordering.ordering_value_columns]
         ):
             # Need to hide ordering columns that are being dropped. Alternatively, could project offsets
@@ -459,7 +439,7 @@ def shape(self) -> typing.Tuple[int, int]:
         """Returns dimensions as (length, width) tuple."""
         width = len(self.columns)
-        count_expr = self._to_ibis_expr(ordering_mode="unordered").count()
+        count_expr = self._to_ibis_expr().count()
         sql = self._session.ibis_client.compile(count_expr)
 
         # Support in-memory engines for hermetic unit tests.
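
Reviewer note (illustration, not part of the patch): the offset predicates that the slice logic further down in this commit builds on top of `_offsets` and `_project_offsets` reduce to plain integer arithmetic. A minimal runnable sketch of that predicate in standalone Python, assuming `start`/`stop` are already normalized to non-negative positions and `step >= 1` (names are illustrative, not from the codebase):

    from typing import Optional

    # Sketch only. Mirrors the slice conditions (offsets >= start),
    # (offsets < stop), and ((offsets - start) % step == 0).
    def keep_row(offset: int, start: int, stop: Optional[int], step: int) -> bool:
        if offset < start:
            return False
        if stop is not None and offset >= stop:
            return False
        # Keep rows whose distance from start is a multiple of step.
        return (offset - start) % step == 0

    assert [r for r in range(10) if keep_row(r, 2, 9, 3)] == [2, 5, 8]
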
@@ -527,7 +507,7 @@ def project_unary_op( self, column_name: str, op: ops.UnaryOp, output_name=None ) -> ArrayValue: """Creates a new expression based on this expression with unary operation applied to one column.""" - value = op._as_ibis(self.get_column(column_name)).name( + value = op._as_ibis(self._get_ibis_column(column_name)).name( output_name or column_name ) return self._set_or_replace_by_id(output_name or column_name, value) @@ -541,7 +521,8 @@ def project_binary_op( ) -> ArrayValue: """Creates a new expression based on this expression with binary operation applied to two columns.""" value = op( - self.get_column(left_column_id), self.get_column(right_column_id) + self._get_ibis_column(left_column_id), + self._get_ibis_column(right_column_id), ).name(output_column_id) return self._set_or_replace_by_id(output_column_id, value) @@ -555,9 +536,9 @@ def project_ternary_op( ) -> ArrayValue: """Creates a new expression based on this expression with ternary operation applied to three columns.""" value = op( - self.get_column(col_id_1), - self.get_column(col_id_2), - self.get_column(col_id_3), + self._get_ibis_column(col_id_1), + self._get_ibis_column(col_id_2), + self._get_ibis_column(col_id_3), ).name(output_column_id) return self._set_or_replace_by_id(output_column_id, value) @@ -574,7 +555,7 @@ def aggregate( by_column_id: column id of the aggregation key, this is preserved through the transform dropna: whether null keys should be dropped """ - table = self._to_ibis_expr(ordering_mode="unordered") + table = self._to_ibis_expr() stats = { col_out: agg_op._as_ibis(table[col_in]) for col_in, agg_op, col_out in aggregations @@ -594,10 +575,10 @@ def aggregate( if dropna: for column_id in by_column_ids: expr = expr._filter( - ops.notnull_op._as_ibis(expr.get_column(column_id)) + ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) ) # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation - return expr.project_offsets() + return expr._project_offsets() else: aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} result = table.aggregate(**aggregates) @@ -624,7 +605,7 @@ def corr_aggregate( Arguments: corr_aggregations: left_column_id, right_column_id, output_column_id tuples """ - table = self._to_ibis_expr(ordering_mode="unordered") + table = self._to_ibis_expr() stats = { col_out: table[col_left].corr(table[col_right], how="pop") for col_left, col_right, col_out in corr_aggregations @@ -664,7 +645,7 @@ def project_window_op( never_skip_nulls: will disable null skipping for operators that would otherwise do so skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ - column = typing.cast(ibis_types.Column, self.get_column(column_name)) + column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) window_op = op._as_ibis(column, window) @@ -700,26 +681,36 @@ def project_window_op( def to_sql( self, - ordering_mode: Literal[ - "order_by", "string_encoded", "offset_col", "unordered" - ] = "order_by", - order_col_name: Optional[str] = ORDER_ID_COLUMN, + offset_column: typing.Optional[str] = None, col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, ) -> str: + offsets_id = offset_column or ORDER_ID_COLUMN + sql = 
self._session.ibis_client.compile(
             self._to_ibis_expr(
-                ordering_mode=ordering_mode,
-                order_col_name=order_col_name,
+                ordering_mode="offset_col"
+                if (offset_column or sorted)
+                else "unordered",
+                order_col_name=offsets_id,
                 col_id_overrides=col_id_overrides,
             )
         )
+        if sorted:
+            sql = textwrap.dedent(
+                f"""
+                SELECT * EXCEPT ({offsets_id})
+                FROM ({sql})
+                ORDER BY {offsets_id}
+                """
+            )
         return typing.cast(str, sql)
 
     def _to_ibis_expr(
         self,
         ordering_mode: Literal[
-            "order_by", "string_encoded", "offset_col", "unordered"
-        ] = "order_by",
+            "string_encoded", "offset_col", "unordered"
+        ] = "unordered",
         order_col_name: Optional[str] = ORDER_ID_COLUMN,
         expose_hidden_cols: bool = False,
         fraction: Optional[float] = None,
@@ -731,8 +722,6 @@
         ArrayValue objects are sorted, so the following options are available
         to reflect this in the ibis expression.
 
-        * "order_by" (Default): The output table will not have an ordering
-          column, however there will be an order_by clause applied to the ouput.
         * "offset_col": Zero-based offsets are generated as a column, this will
           not sort the rows however.
         * "string_encoded": An ordered string column is provided in output table.
@@ -760,7 +749,6 @@
             An ibis expression representing the data held by the ArrayValue
             object.
         """
         assert ordering_mode in (
-            "order_by",
             "string_encoded",
             "offset_col",
             "unordered",
         )
@@ -775,18 +763,16 @@
             str
         ] = []  # Ordering/Filtering columns that will be dropped at end
-        if self.reduced_predicate is not None:
-            columns.append(self.reduced_predicate)
+        if self._reduced_predicate is not None:
+            columns.append(self._reduced_predicate)
             # Usually drop predicate as it will be all TRUE after filtering
             if not expose_hidden_cols:
-                columns_to_drop.append(self.reduced_predicate.get_name())
+                columns_to_drop.append(self._reduced_predicate.get_name())
         order_columns = self._create_order_columns(
             ordering_mode, order_col_name, expose_hidden_cols
         )
         columns.extend(order_columns)
-        if (ordering_mode == "order_by") and not expose_hidden_cols:
-            columns_to_drop.extend(col.get_name() for col in order_columns)
 
         # Special case for empty tables, since we can't create an empty
         # projection.
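
Reviewer note (illustration, not part of the patch): the new `sorted` path in `to_sql` above composes two pieces: pick an ordering mode for compilation, then wrap the compiled SQL so callers get ordered rows without seeing the offsets column. A runnable sketch of just that logic; the offsets column name is illustrative (the real code uses ORDER_ID_COLUMN), and the backtick quoting follows the form this series settles on in patch 5:

    import textwrap
    from typing import Optional

    # Sketch only. Mirrors the mode selection in to_sql(): materialize an
    # offsets column only when the caller asks for one or wants sorted output.
    def choose_ordering_mode(offset_column: Optional[str], sorted: bool) -> str:
        return "offset_col" if (offset_column or sorted) else "unordered"

    # Mirrors the sorted=True wrapper: order by the offsets column, then
    # drop it from the visible result.
    def wrap_sorted(inner_sql: str, offsets_id: str = "bigframes_ordering_id") -> str:
        return textwrap.dedent(
            f"""
            SELECT * EXCEPT (`{offsets_id}`)
            FROM ({inner_sql})
            ORDER BY `{offsets_id}`
            """
        )

    assert choose_ordering_mode(None, sorted=False) == "unordered"
    assert choose_ordering_mode(None, sorted=True) == "offset_col"
    print(wrap_sorted("SELECT 1 AS x, 0 AS bigframes_ordering_id"))
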
@@ -799,15 +785,8 @@ def _to_ibis_expr( bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns ) base_table = table - if self.reduced_predicate is not None: + if self._reduced_predicate is not None: table = table.filter(base_table[PREDICATE_COLUMN]) - if ordering_mode == "order_by": - table = table.order_by( - _convert_ordering_to_table_values( - {col: base_table[col] for col in table.columns}, - self._ordering.all_ordering_columns, - ) # type: ignore - ) table = table.drop(*columns_to_drop) if col_id_overrides: table = table.relabel(col_id_overrides) @@ -826,24 +805,24 @@ def _create_order_columns( return (self._create_offset_column().name(order_col_name),) elif ordering_mode == "string_encoded": return (self._create_string_ordering_column().name(order_col_name),) - elif ordering_mode == "order_by" or expose_hidden_cols: + elif expose_hidden_cols: return self.hidden_ordering_columns return () def _create_offset_column(self) -> ibis_types.IntegerColumn: if self._ordering.total_order_col and self._ordering.is_sequential: - offsets = self.get_any_column(self._ordering.total_order_col.column_id) + offsets = self._get_any_column(self._ordering.total_order_col.column_id) return typing.cast(ibis_types.IntegerColumn, offsets) else: window = ibis.window(order_by=self._ibis_order) if self._predicates: - window = window.group_by(self.reduced_predicate) + window = window.group_by(self._reduced_predicate) offsets = ibis.row_number().over(window) return typing.cast(ibis_types.IntegerColumn, offsets) def _create_string_ordering_column(self) -> ibis_types.StringColumn: if self._ordering.total_order_col and self._ordering.is_string_encoded: - string_order_ids = self.get_any_column( + string_order_ids = self._get_any_column( self._ordering.total_order_col.column_id ) return typing.cast(ibis_types.StringColumn, string_order_ids) @@ -852,7 +831,7 @@ def _create_string_ordering_column(self) -> ibis_types.StringColumn: and self._ordering.integer_encoding.is_encoded ): # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers - int_values = self.get_any_column(self._ordering.total_order_col.column_id) + int_values = self._get_any_column(self._ordering.total_order_col.column_id) return encode_order_string( typing.cast(ibis_types.IntegerColumn, int_values), ) @@ -860,7 +839,7 @@ def _create_string_ordering_column(self) -> ibis_types.StringColumn: # Have to build string from scratch window = ibis.window(order_by=self._ibis_order) if self._predicates: - window = window.group_by(self.reduced_predicate) + window = window.group_by(self._reduced_predicate) row_nums = typing.cast( ibis_types.IntegerColumn, ibis.row_number().over(window) ) @@ -870,7 +849,8 @@ def start_query( self, job_config: Optional[bigquery.job.QueryJobConfig] = None, max_results: Optional[int] = None, - expose_extra_columns: bool = False, + *, + sorted: bool = True, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """Execute a query and return metadata about the results.""" # TODO(swast): Cache the job ID so we can look it up again if they ask @@ -883,8 +863,9 @@ def start_query( # a LocalSession for unit testing. # TODO(swast): Add a timeout here? If the query is taking a long time, # maybe we just print the job metadata that we have so far? 
- table = self._to_ibis_expr(expose_hidden_cols=expose_extra_columns) - sql = self._session.ibis_client.compile(table) # type:ignore + + # DO NOT COMMIT: Make this ordered + sql = self.to_sql(sorted=True) # type:ignore return self._session._start_query( sql=sql, job_config=job_config, @@ -903,7 +884,6 @@ def _reproject_to_table(self) -> ArrayValue: recursively in projections. """ table = self._to_ibis_expr( - ordering_mode="unordered", expose_hidden_cols=True, ) columns = [table[column_name] for column_name in self._column_names] @@ -926,14 +906,16 @@ def _reproject_to_table(self) -> ArrayValue: def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): group_by: typing.List[ibis_types.Value] = ( [ - typing.cast(ibis_types.Column, _as_identity(self.get_column(column))) + typing.cast( + ibis_types.Column, _as_identity(self._get_ibis_column(column)) + ) for column in window_spec.grouping_keys ] if window_spec.grouping_keys else [] ) - if self.reduced_predicate is not None: - group_by.append(self.reduced_predicate) + if self._reduced_predicate is not None: + group_by.append(self._reduced_predicate) if window_spec.ordering: order_by = _convert_ordering_to_table_values( {**self._column_names, **self._hidden_ordering_column_names}, @@ -984,7 +966,7 @@ def unpivot( """ if how not in ("left", "right"): raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) + table = self._to_ibis_expr(expose_hidden_cols=True) row_n = len(row_labels) hidden_col_ids = self._hidden_ordering_column_names.keys() if not all( @@ -1107,7 +1089,9 @@ def unpivot( ) def assign(self, source_id: str, destination_id: str) -> ArrayValue: - return self._set_or_replace_by_id(destination_id, self.get_column(source_id)) + return self._set_or_replace_by_id( + destination_id, self._get_ibis_column(source_id) + ) def assign_constant( self, @@ -1134,10 +1118,10 @@ def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> ArrayVa return self._hide_column(id)._set_or_replace_by_id(id, new_value) builder = self.builder() - if id in self.column_names: + if id in self.column_ids: builder.columns = [ val if (col_id != id) else new_value.name(id) - for col_id, val in self.column_names.items() + for col_id, val in zip(self.column_ids, self._columns) ] else: builder.columns = [*self.columns, new_value.name(id)] @@ -1155,12 +1139,12 @@ def slice( if not step: step = 1 - expr_with_offsets = self.project_offsets() + expr_with_offsets = self._project_offsets() # start with True and reduce with start, stop, and step conditions - cond_list = [expr_with_offsets.offsets == expr_with_offsets.offsets] + cond_list = [expr_with_offsets._offsets == expr_with_offsets._offsets] - last_offset = expr_with_offsets.offsets.max() + last_offset = expr_with_offsets._offsets.max() # Convert negative indexes to positive indexes if start and start < 0: @@ -1170,20 +1154,20 @@ def slice( if start is not None: if step >= 1: - cond_list.append(expr_with_offsets.offsets >= start) + cond_list.append(expr_with_offsets._offsets >= start) else: - cond_list.append(expr_with_offsets.offsets <= start) + cond_list.append(expr_with_offsets._offsets <= start) if stop is not None: if step >= 1: - cond_list.append(expr_with_offsets.offsets < stop) + cond_list.append(expr_with_offsets._offsets < stop) else: - cond_list.append(expr_with_offsets.offsets > stop) + cond_list.append(expr_with_offsets._offsets > stop) if step > 1: start = start if (start is not None) else 0 - 
cond_list.append((expr_with_offsets.offsets - start) % step == 0) + cond_list.append((expr_with_offsets._offsets - start) % step == 0) if step < 0: start = start if (start is not None) else last_offset - cond_list.append((start - expr_with_offsets.offsets) % (-step) == 0) + cond_list.append((start - expr_with_offsets._offsets) % (-step) == 0) sliced_expr = expr_with_offsets._filter( functools.reduce(lambda x, y: x & y, cond_list) @@ -1192,16 +1176,14 @@ def slice( def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: """Write the ArrayValue to a session table and create a new block object that references it.""" - ibis_expr = self._to_ibis_expr( - ordering_mode="unordered", expose_hidden_cols=True - ) + ibis_expr = self._to_ibis_expr(expose_hidden_cols=True) destination = self._session._ibis_to_session_table( ibis_expr, cluster_cols=cluster_cols, api_name="cache" ) table_expression = self._session.ibis_client.sql( f"SELECT * FROM `_SESSION`.`{destination.table_id}`" ) - new_columns = [table_expression[column] for column in self.column_names] + new_columns = [table_expression[column] for column in self.column_ids] new_hidden_columns = [ table_expression[column] for column in self._hidden_ordering_column_names ] diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 863852c684..510ba32e26 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -151,7 +151,7 @@ def value_columns(self) -> Sequence[str]: """All value columns, mutually exclusive with index columns.""" return [ column - for column in self._expr.column_names + for column in self._expr.column_ids if column not in self.index_columns ] @@ -443,9 +443,7 @@ def _compute_and_count( # TODO(swast): Allow for dry run and timeout. expr = self._apply_value_keys_to_expr(value_keys=value_keys) - results_iterator, query_job = expr.start_query( - max_results=max_results, expose_extra_columns=True - ) + results_iterator, query_job = expr.start_query(max_results=max_results) table_size = expr._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES fraction = ( @@ -482,12 +480,6 @@ def _compute_and_count( if self.index_columns: df.set_index(list(self.index_columns), inplace=True) df.index.names = self.index.names # type: ignore - - df.drop( - [col for col in df.columns if col not in self.value_columns], - axis=1, - inplace=True, - ) elif (sampling_method == _UNIFORM) and (random_state is None): filtered_expr = self.expr._uniform_sampling(fraction) block = Block( @@ -519,12 +511,6 @@ def _compute_and_count( df.set_index(list(self.index_columns), inplace=True) df.index.names = self.index.names # type: ignore - df.drop( - [col for col in df.columns if col not in self.value_columns], - axis=1, - inplace=True, - ) - return df, total_rows, query_job def _split( @@ -1086,7 +1072,7 @@ def _normalize_expression( ): """Normalizes expression by moving index columns to left.""" value_columns = [ - col_id for col_id in expr.column_names.keys() if col_id not in index_columns + col_id for col_id in expr.column_ids if col_id not in index_columns ] if (assert_value_size is not None) and ( len(value_columns) != assert_value_size @@ -1095,7 +1081,7 @@ def _normalize_expression( return expr.select_columns([*index_columns, *value_columns]) def slice( - self: bigframes.core.blocks.Block, + self, start: typing.Optional[int] = None, stop: typing.Optional[int] = None, step: typing.Optional[int] = None, @@ -1395,7 +1381,7 @@ def concat( ) result_block = Block( result_expr, - 
index_columns=list(result_expr.column_names.keys())[:index_nlevels], + index_columns=list(result_expr.column_ids)[:index_nlevels], column_labels=aligned_blocks[0].column_labels, index_labels=result_labels, ) @@ -1457,9 +1443,7 @@ def to_sql_query( # the BigQuery unicode column name feature? substitutions[old_id] = new_id - sql = array_value.to_sql( - ordering_mode="unordered", col_id_overrides=substitutions - ) + sql = array_value.to_sql(col_id_overrides=substitutions) return ( sql, new_ids[: len(idx_labels)], diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 9be7f22a71..db0843fcbc 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -426,10 +426,6 @@ def __init__( self._value_name = value_name self._dropna = dropna # Applies to aggregations but not windowing - @property - def _value(self): - return self._block.expr.get_column(self._value_column) - def all(self) -> series.Series: return self._aggregate(agg_ops.all_op) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index f211afe4d5..7d15e67649 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -398,9 +398,7 @@ def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. index_columns = list(self._block.index_columns) - expr = self._expr.projection( - [self._expr.get_any_column(col) for col in index_columns] - ) + expr = self._expr.select_columns(index_columns) results, _ = expr.start_query() df = expr._session._rows_to_dataframe(results) df = df.set_index(index_columns) diff --git a/bigframes/core/joins/row_identity.py b/bigframes/core/joins/row_identity.py index 66eb223990..156e7aef40 100644 --- a/bigframes/core/joins/row_identity.py +++ b/bigframes/core/joins/row_identity.py @@ -38,11 +38,11 @@ def join_by_row_identity( f"Only how='outer','left','inner' currently supported. {constants.FEEDBACK_LINK}" ) - if not left.table.equals(right.table): + if not left._table.equals(right._table): raise ValueError( "Cannot combine objects without an explicit join/merge key. 
" - f"Left based on: {left.table.compile()}, but " - f"right based on: {right.table.compile()}" + f"Left based on: {left._table.compile()}, but " + f"right based on: {right._table.compile()}" ) left_predicates = left._predicates @@ -63,11 +63,11 @@ def join_by_row_identity( left_mask = left_relative_predicates if how in ["right", "outer"] else None right_mask = right_relative_predicates if how in ["left", "outer"] else None joined_columns = [ - _mask_value(left.get_column(key), left_mask).name(map_left_id(key)) - for key in left.column_names.keys() + _mask_value(left._get_ibis_column(key), left_mask).name(map_left_id(key)) + for key in left.column_ids ] + [ - _mask_value(right.get_column(key), right_mask).name(map_right_id(key)) - for key in right.column_names.keys() + _mask_value(right._get_ibis_column(key), right_mask).name(map_right_id(key)) + for key in right.column_ids ] # If left isn't being masked, can just use left ordering @@ -108,7 +108,7 @@ def join_by_row_identity( joined_expr = core.ArrayValue( left._session, - left.table, + left._table, columns=joined_columns, hidden_ordering_columns=hidden_ordering_columns, ordering=new_ordering, diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/joins/single_column.py index 8a9825cf0b..ea138aa885 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/joins/single_column.py @@ -79,14 +79,14 @@ def join_by_column( if ( allow_row_identity_join and how in bigframes.core.joins.row_identity.SUPPORTED_ROW_IDENTITY_HOW - and left.table.equals(right.table) + and left._table.equals(right._table) # Make sure we're joining on exactly the same column(s), at least with # regards to value its possible that they both have the same names but # were modified in different ways. Ignore differences in the names. 
and all( - left.get_any_column(lcol) + left._get_any_column(lcol) .name("index") - .equals(right.get_any_column(rcol).name("index")) + .equals(right._get_any_column(rcol).name("index")) for lcol, rcol in zip(left_column_ids, right_column_ids) ) ): @@ -95,16 +95,18 @@ def join_by_column( get_column_right, ) = bigframes.core.joins.row_identity.join_by_row_identity(left, right, how=how) left_join_keys = [ - combined_expr.get_column(get_column_left(col)) for col in left_column_ids + combined_expr._get_ibis_column(get_column_left(col)) + for col in left_column_ids ] right_join_keys = [ - combined_expr.get_column(get_column_right(col)) for col in right_column_ids + combined_expr._get_ibis_column(get_column_right(col)) + for col in right_column_ids ] join_key_cols = get_join_cols( left_join_keys, right_join_keys, how, coalesce_join_keys ) join_key_ids = [col.get_name() for col in join_key_cols] - combined_expr = combined_expr.projection( + combined_expr = combined_expr._projection( [*join_key_cols, *combined_expr.columns] ) if sort: @@ -126,13 +128,13 @@ def join_by_column( lmapping = { col_id: guid.generate_guid() for col_id in itertools.chain( - left.column_names, left._hidden_ordering_column_names + left.column_ids, left._hidden_ordering_column_names ) } rmapping = { col_id: guid.generate_guid() for col_id in itertools.chain( - right.column_names, right._hidden_ordering_column_names + right.column_ids, right._hidden_ordering_column_names ) } @@ -143,12 +145,10 @@ def get_column_right(col_id): return rmapping[col_id] left_table = left._to_ibis_expr( - ordering_mode="unordered", expose_hidden_cols=True, col_id_overrides=lmapping, ) right_table = right._to_ibis_expr( - ordering_mode="unordered", expose_hidden_cols=True, col_id_overrides=rmapping, ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ea06e28cdf..5d960d6113 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -547,7 +547,7 @@ def _apply_series_binop( other._block.index, how=how ) - series_column_id = other._value.get_name() + series_column_id = other._value_column series_col = get_column_right(series_column_id) block = joined_index._block for column_id, label in zip( @@ -2404,13 +2404,11 @@ def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: if ordering_id is not None: return array_value.to_sql( - ordering_mode="offset_col", + offset_column=ordering_id, col_id_overrides=id_overrides, - order_col_name=ordering_id, ) else: return array_value.to_sql( - ordering_mode="unordered", col_id_overrides=id_overrides, ) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index add6af57f4..f6b47f975e 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -16,7 +16,6 @@ import typing -import ibis.expr.types as ibis_types import pandas as pd import bigframes.constants as constants @@ -98,11 +97,6 @@ def __init__( if pd_series.name is None: self._block = self._block.with_column_labels([None]) - @property - def _value(self) -> ibis_types.Value: - """Private property to get Ibis expression for the value column.""" - return self._block.expr.get_column(self._value_column) - @property def _value_column(self) -> str: return self._block.value_columns[0] diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index ee0cefb3d2..ba789cab31 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import ibis import ibis.expr.types as ibis_types import pandas @@ -45,7 +44,7 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): actual = core.ArrayValue( session=session, table=ibis_table, columns=columns, ordering=ordering ) - assert actual.table is ibis_table + assert actual._table is ibis_table assert len(actual.columns) == 3 @@ -79,37 +78,12 @@ def test_arrayvalue_with_get_column(): ), total_ordering_columns=["col1"], ) - col1 = value.get_column("col1") + col1 = value._get_ibis_column("col1") assert isinstance(col1, ibis_types.Value) assert col1.get_name() == "col1" assert col1.type().is_int64() -def test_arrayvalue_to_ibis_expr_with_projection(): - value = resources.create_arrayvalue( - pandas.DataFrame( - { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": [0.1, 0.2, 0.3], - } - ), - total_ordering_columns=["col1"], - ) - expr = value.projection( - [ - (value.table["col1"] + ibis.literal(-1)).name("int64_col"), - ibis.literal(123456789).name("literals"), - value.table["col2"].name("string_col"), - ] - ) - actual = expr._to_ibis_expr() - assert len(actual.columns) == 3 - assert actual.columns[0] == "int64_col" - assert actual.columns[1] == "literals" - assert actual.columns[2] == "string_col" - - def test_arrayvalues_to_ibis_expr_with_get_column(): value = resources.create_arrayvalue( pandas.DataFrame( @@ -121,7 +95,7 @@ def test_arrayvalues_to_ibis_expr_with_get_column(): ), total_ordering_columns=["col1"], ) - expr = value.get_column("col1") + expr = value._get_ibis_column("col1") assert expr.get_name() == "col1" assert expr.type().is_int64() From 7d7a5077246e77a3a6b1240544197130267dbcd9 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 3 Oct 2023 07:36:03 +0000 Subject: [PATCH 2/6] move slice completely to block --- bigframes/core/__init__.py | 52 ++-------------------- bigframes/core/blocks.py | 91 ++++++++++++++++++++++++++++++++++---- 2 files changed, 86 insertions(+), 57 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index d94b60c866..c4ebc6f07c 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -351,7 +351,7 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue: ) @property - def _offsets(self) -> ibis_types.Value: + def _offsets(self) -> ibis_types.IntegerColumn: if not self._ordering.is_sequential: raise ValueError( "Expression does not have offsets. Generate them first using project_offsets." @@ -360,7 +360,8 @@ def _offsets(self) -> ibis_types.Value: raise ValueError( "Ordering is invalid. Marked as sequential but no total order columns." ) - return self._get_any_column(self._ordering.total_order_col.column_id) + column = self._get_any_column(self._ordering.total_order_col.column_id) + return typing.cast(ibis_types.IntegerColumn, column) def _project_offsets(self) -> ArrayValue: """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. 
Has no effect on expression semantics.""" @@ -1127,53 +1128,6 @@ def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> ArrayVa builder.columns = [*self.columns, new_value.name(id)] return builder.build() - def slice( - self, - start: typing.Optional[int] = None, - stop: typing.Optional[int] = None, - step: typing.Optional[int] = None, - ) -> ArrayValue: - if step == 0: - raise ValueError("slice step cannot be zero") - - if not step: - step = 1 - - expr_with_offsets = self._project_offsets() - - # start with True and reduce with start, stop, and step conditions - cond_list = [expr_with_offsets._offsets == expr_with_offsets._offsets] - - last_offset = expr_with_offsets._offsets.max() - - # Convert negative indexes to positive indexes - if start and start < 0: - start = last_offset + start + 1 - if stop and stop < 0: - stop = last_offset + stop + 1 - - if start is not None: - if step >= 1: - cond_list.append(expr_with_offsets._offsets >= start) - else: - cond_list.append(expr_with_offsets._offsets <= start) - if stop is not None: - if step >= 1: - cond_list.append(expr_with_offsets._offsets < stop) - else: - cond_list.append(expr_with_offsets._offsets > stop) - if step > 1: - start = start if (start is not None) else 0 - cond_list.append((expr_with_offsets._offsets - start) % step == 0) - if step < 0: - start = start if (start is not None) else last_offset - cond_list.append((start - expr_with_offsets._offsets) % (-step) == 0) - - sliced_expr = expr_with_offsets._filter( - functools.reduce(lambda x, y: x & y, cond_list) - ) - return sliced_expr if step > 0 else sliced_expr.reversed() - def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: """Write the ArrayValue to a session table and create a new block object that references it.""" ibis_expr = self._to_ibis_expr(expose_hidden_cols=True) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 510ba32e26..c793faa00e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1086,15 +1086,90 @@ def slice( stop: typing.Optional[int] = None, step: typing.Optional[int] = None, ) -> bigframes.core.blocks.Block: - sliced_expr = self.expr.slice(start=start, stop=stop, step=step) - # since this is slice, return a copy even if unchanged - block = Block( - sliced_expr, - index_columns=self.index_columns, - column_labels=self.column_labels, - index_labels=self._index_labels, + start = start or 0 + if step is None: + step = 1 + if step == 0: + raise ValueError("slice step cannot be zero") + if step < 0: + adj_start = -start + 1 if start > 0 else -start - 1 + if stop: + adj_stop = -stop + 1 if stop > 0 else -stop - 1 + else: + adj_stop = None + adj_step = -step + return ( + self.reversed()._forward_slice(adj_start, adj_stop, adj_step).reversed() + ) + return self._forward_slice(start or 0, stop, step) + + def _forward_slice(self, start: int = 0, stop=None, step: int = 1): + """Performs slice but only for positive step size.""" + if step <= 0: + raise ValueError("forward_slice only supports positive step size") + + use_postive_offsets = ( + (start > 0) + or ((stop is not None) and (stop >= 0)) + or ((step > 1) and (start >= 0)) ) - return block + use_negative_offsets = ( + (start < 0) or (stop and (stop < 0)) or ((step > 1) and (start < 0)) + ) + + block = self + + # only generate offsets that are used + positive_offsets = None + negative_offsets = None + if use_postive_offsets: + block, positive_offsets = self.promote_offsets() + if use_negative_offsets: + block, negative_offsets = 
block.reversed().promote_offsets() + block = block.reversed() + + conditions = [] + if start != 0: + if start > 0: + op = ops.partial_right(ops.ge_op, start) + assert positive_offsets + block, start_cond = block.apply_unary_op(positive_offsets, op) + else: + op = ops.partial_right(ops.le_op, -start - 1) + assert negative_offsets + block, start_cond = block.apply_unary_op(negative_offsets, op) + conditions.append(start_cond) + if stop is not None: + if stop >= 0: + op = ops.partial_right(ops.lt_op, stop) + assert positive_offsets + block, stop_cond = block.apply_unary_op(positive_offsets, op) + else: + op = ops.partial_right(ops.gt_op, -stop - 1) + assert negative_offsets + block, stop_cond = block.apply_unary_op(negative_offsets, op) + conditions.append(stop_cond) + + if step > 1: + op = ops.partial_right(ops.mod_op, step) + if start >= 0: + op = ops.partial_right(ops.sub_op, start) + assert positive_offsets + block, start_diff = block.apply_unary_op(positive_offsets, op) + else: + op = ops.partial_right(ops.sub_op, -start + 1) + assert negative_offsets + block, start_diff = block.apply_unary_op(negative_offsets, op) + modulo_op = ops.partial_right(ops.mod_op, step) + block, mod = block.apply_unary_op(start_diff, modulo_op) + is_zero_op = ops.partial_right(ops.eq_op, 0) + block, step_cond = block.apply_unary_op(mod, is_zero_op) + conditions.append(step_cond) + + for cond in conditions: + block = block.filter(cond) + + return block.select_columns(self.value_columns) # Using cache to optimize for Jupyter Notebook's behavior where both '__repr__' # and '__repr_html__' are called in a single display action, reducing redundant From 589b8f92311d92b9c798b2d22ae102dd0203c457 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 3 Oct 2023 17:36:46 +0000 Subject: [PATCH 3/6] fix new iloc impl --- bigframes/core/blocks.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c793faa00e..0760e7e9a0 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1086,20 +1086,16 @@ def slice( stop: typing.Optional[int] = None, step: typing.Optional[int] = None, ) -> bigframes.core.blocks.Block: - start = start or 0 if step is None: step = 1 if step == 0: raise ValueError("slice step cannot be zero") if step < 0: - adj_start = -start + 1 if start > 0 else -start - 1 - if stop: - adj_stop = -stop + 1 if stop > 0 else -stop - 1 - else: - adj_stop = None - adj_step = -step - return ( - self.reversed()._forward_slice(adj_start, adj_stop, adj_step).reversed() + reverse_start = (-start - 1) if start else 0 + reverse_stop = (-stop - 1) if stop else None + reverse_step = -step + return self.reversed()._forward_slice( + reverse_start, reverse_stop, reverse_step ) return self._forward_slice(start or 0, stop, step) @@ -1122,6 +1118,7 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1): # only generate offsets that are used positive_offsets = None negative_offsets = None + if use_postive_offsets: block, positive_offsets = self.promote_offsets() if use_negative_offsets: From ec3d2f0a72e0916ee020fad987ca9ced589a8343 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 5 Oct 2023 18:06:50 +0000 Subject: [PATCH 4/6] Add reproject to prevent delayed filter application in series.apply --- bigframes/series.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bigframes/series.py b/bigframes/series.py index 2a0b1ff1fa..717a85a93e 100644 --- a/bigframes/series.py +++ 
b/bigframes/series.py @@ -1150,7 +1150,11 @@ def _groupby_values( def apply(self, func) -> Series: # TODO(shobs, b/274645634): Support convert_dtype, args, **kwargs # is actually a ternary op - return self._apply_unary_op(ops.RemoteFunctionOp(func)) + # Reproject as workaround to applying filter too late. This forces the filter + # to be applied before passing data to remote function, protecting from bad + # inputs causing errors. + reprojected_series = Series(self._block._force_reproject()) + return reprojected_series._apply_unary_op(ops.RemoteFunctionOp(func)) def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) From 45bfc43f3231e513064ab8d3aff25b4fb41c49ef Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 5 Oct 2023 19:11:35 +0000 Subject: [PATCH 5/6] pr comments --- bigframes/core/__init__.py | 25 ++++++++++++------------- bigframes/core/joins/single_column.py | 2 ++ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 724c9904f3..ccfd682215 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -337,7 +337,9 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue: .. warning:: The row numbers of result is non-deterministic, avoid to use. """ - table = self._to_ibis_expr(expose_hidden_cols=True, fraction=fraction) + table = self._to_ibis_expr( + "unordered", expose_hidden_cols=True, fraction=fraction + ) columns = [table[column_name] for column_name in self._column_names] hidden_ordering_columns = [ table[column_name] for column_name in self._hidden_ordering_column_names @@ -440,7 +442,7 @@ def _projection(self, columns: Iterable[ibis_types.Value]) -> ArrayValue: def shape(self) -> typing.Tuple[int, int]: """Returns dimensions as (length, width) tuple.""" width = len(self.columns) - count_expr = self._to_ibis_expr().count() + count_expr = self._to_ibis_expr("unordered").count() sql = self._session.ibis_client.compile(count_expr) # Support in-memory engines for hermetic unit tests. @@ -556,7 +558,7 @@ def aggregate( by_column_id: column id of the aggregation key, this is preserved through the transform dropna: whether null keys should be dropped """ - table = self._to_ibis_expr() + table = self._to_ibis_expr("unordered") stats = { col_out: agg_op._as_ibis(table[col_in]) for col_in, agg_op, col_out in aggregations @@ -606,7 +608,7 @@ def corr_aggregate( Arguments: corr_aggregations: left_column_id, right_column_id, output_column_id tuples """ - table = self._to_ibis_expr() + table = self._to_ibis_expr("unordered") stats = { col_out: table[col_left].corr(table[col_right], how="pop") for col_left, col_right, col_out in corr_aggregations @@ -700,18 +702,16 @@ def to_sql( if sorted: sql = textwrap.dedent( f""" - SELECT * EXCEPT ({offsets_id}) + SELECT * EXCEPT (`{offsets_id}`) FROM ({sql}) - ORDER BY {offsets_id} + ORDER BY `{offsets_id}` """ ) return typing.cast(str, sql) def _to_ibis_expr( self, - ordering_mode: Literal[ - "string_encoded", "offset_col", "unordered" - ] = "unordered", + ordering_mode: Literal["string_encoded", "offset_col", "unordered"], order_col_name: Optional[str] = ORDER_ID_COLUMN, expose_hidden_cols: bool = False, fraction: Optional[float] = None, @@ -864,8 +864,6 @@ def start_query( # a LocalSession for unit testing. # TODO(swast): Add a timeout here? If the query is taking a long time, # maybe we just print the job metadata that we have so far? 
- - # DO NOT COMMIT: Make this ordered sql = self.to_sql(sorted=True) # type:ignore return self._session._start_query( sql=sql, @@ -885,6 +883,7 @@ def _reproject_to_table(self) -> ArrayValue: recursively in projections. """ table = self._to_ibis_expr( + "unordered", expose_hidden_cols=True, ) columns = [table[column_name] for column_name in self._column_names] @@ -967,7 +966,7 @@ def unpivot( """ if how not in ("left", "right"): raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr(expose_hidden_cols=True) + table = self._to_ibis_expr("unordered", expose_hidden_cols=True) row_n = len(row_labels) hidden_col_ids = self._hidden_ordering_column_names.keys() if not all( @@ -1130,7 +1129,7 @@ def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> ArrayVa def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: """Write the ArrayValue to a session table and create a new block object that references it.""" - ibis_expr = self._to_ibis_expr(expose_hidden_cols=True) + ibis_expr = self._to_ibis_expr("unordered", expose_hidden_cols=True) destination = self._session._ibis_to_session_table( ibis_expr, cluster_cols=cluster_cols, api_name="cache" ) diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/joins/single_column.py index 09b2594f2f..f194b8f8c4 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/joins/single_column.py @@ -138,10 +138,12 @@ def get_column_right(col_id): return rmapping[col_id] left_table = left._to_ibis_expr( + "unordered", expose_hidden_cols=True, col_id_overrides=lmapping, ) right_table = right._to_ibis_expr( + "unordered", expose_hidden_cols=True, col_id_overrides=rmapping, ) From 163a84b549656f49f472cc4d3ef357287e369045 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 5 Oct 2023 19:18:32 +0000 Subject: [PATCH 6/6] fix unit tests --- tests/unit/test_core.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index ba789cab31..69b9e79807 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -112,7 +112,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr._to_ibis_expr() + actual = expr._to_ibis_expr("unordered") assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -149,7 +149,7 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ) expr = value.project_binary_op("col2", "col3", ops.add_op, "col4") assert expr.columns[3].type().is_float64() - actual = expr._to_ibis_expr() + actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 4 assert actual.columns[3] == "col4" @@ -168,7 +168,7 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ) expr = value.project_ternary_op("col2", "col3", "col4", ops.where_op, "col5") assert expr.columns[4].type().is_float64() - actual = expr._to_ibis_expr() + actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 5 assert actual.columns[4] == "col5" @@ -189,7 +189,7 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): by_column_ids=["col1"], dropna=False, ) - actual = expr._to_ibis_expr() + actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 2 assert actual.columns[0] == "col1" assert actual.columns[1] == "col4" @@ -208,7 +208,7 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): total_ordering_columns=["col1"], ) expr = 
value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")])
-    actual = expr._to_ibis_expr()
+    actual = expr._to_ibis_expr("unordered")
     assert len(expr.columns) == 1
     assert actual.columns[0] == "col4"
     assert expr.columns[0].type().is_float64()
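
Closing reviewer note (illustration, not part of the patch): the negative-step case of the reworked `Block.slice` (patches 2 and 3) is handled by mapping to a forward slice over the reversed block. That index mapping can be sanity-checked against Python's built-in slicing; a small self-contained sketch (plain lists stand in for blocks, and the start=0/stop=0 boundary cases are not exercised here):

    # Sketch only. Uses the same reverse_start/reverse_stop normalization as
    # Block.slice for step < 0, then compares against native list slicing.
    def mapped_negative_slice(seq, start, stop, step):
        assert step < 0
        reverse_start = (-start - 1) if start else 0
        reverse_stop = (-stop - 1) if stop else None
        return list(reversed(seq))[reverse_start:reverse_stop:-step]

    data = list(range(10))
    for start, stop, step in [(8, 2, -2), (None, None, -1), (9, None, -3)]:
        assert mapped_negative_slice(data, start, stop, step) == data[start:stop:step]
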