From 0a81b9ef5e9c4932d9046679237b8dd087f3a595 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 18:57:17 +0000 Subject: [PATCH 1/6] refactor: combine all projection nodes into single node type --- bigframes/core/__init__.py | 124 ++++++++++++++----- bigframes/core/blocks.py | 18 +-- bigframes/core/compile/compiled.py | 104 ++++++---------- bigframes/core/compile/compiler.py | 26 +--- bigframes/core/compile/scalar_op_compiler.py | 16 ++- bigframes/core/expression.py | 13 ++ bigframes/core/indexes/index.py | 2 +- bigframes/core/nodes.py | 39 +----- tests/unit/test_core.py | 8 +- 9 files changed, 173 insertions(+), 177 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index e32977fbce..8c08698b93 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -22,7 +22,7 @@ import pandas import bigframes.core.compile as compiling -import bigframes.core.expression as expressions +import bigframes.core.expression as ex import bigframes.core.guid import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference @@ -114,12 +114,6 @@ def row_count(self) -> ArrayValue: return ArrayValue(nodes.RowCountNode(child=self.node)) # Operations - - def drop_columns(self, columns: Iterable[str]) -> ArrayValue: - return ArrayValue( - nodes.DropColumnsNode(child=self.node, columns=tuple(columns)) - ) - def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" return ArrayValue( @@ -140,21 +134,104 @@ def promote_offsets(self, col_id: str) -> ArrayValue: """ return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)) - def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: - return ArrayValue( - nodes.SelectNode(child=self.node, column_ids=tuple(column_ids)) - ) - def concat(self, other: typing.Sequence[ArrayValue]) -> 
ArrayValue: """Append together multiple ArrayValue objects.""" return ArrayValue( nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]])) ) - def project(self, expression: expressions.Expression, output_id: str): + def project_to_id(self, expression: ex.Expression, output_id: str): + if output_id in self.column_ids: # Mutate case + exprs = [ + ((expression if (col_id == output_id) else ex.free_var(col_id)), col_id) + for col_id in self.column_ids + ] + else: # append case + self_projection = ( + (ex.free_var(col_id), col_id) for col_id in self.column_ids + ) + exprs = [*self_projection, (expression, output_id)] + return ArrayValue( + nodes.ProjectionNode( + child=self.node, + assignments=tuple(exprs), + ) + ) + + def assign(self, source_id: str, destination_id: str) -> ArrayValue: + if destination_id in self.column_ids: # Mutate case + exprs = [ + ( + ( + ex.free_var(source_id) + if (col_id == destination_id) + else ex.free_var(col_id) + ), + col_id, + ) + for col_id in self.column_ids + ] + else: # append case + self_projection = ( + (ex.free_var(col_id), col_id) for col_id in self.column_ids + ) + exprs = [*self_projection, (ex.free_var(source_id), destination_id)] + return ArrayValue( + nodes.ProjectionNode( + child=self.node, + assignments=tuple(exprs), + ) + ) + + def assign_constant( + self, + destination_id: str, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> ArrayValue: + if destination_id in self.column_ids: # Mutate case + exprs = [ + ( + ( + ex.const(value, dtype) + if (col_id == destination_id) + else ex.free_var(col_id) + ), + col_id, + ) + for col_id in self.column_ids + ] + else: # append case + self_projection = ( + (ex.free_var(col_id), col_id) for col_id in self.column_ids + ) + exprs = [*self_projection, (ex.const(value, dtype), destination_id)] + return ArrayValue( + nodes.ProjectionNode( + child=self.node, + assignments=tuple(exprs), + ) + ) + + def select_columns(self, column_ids: 
typing.Sequence[str]) -> ArrayValue: + selections = ((ex.free_var(col_id), col_id) for col_id in column_ids) + return ArrayValue( + nodes.ProjectionNode( + child=self.node, + assignments=tuple(selections), + ) + ) + + def drop_columns(self, columns: Iterable[str]) -> ArrayValue: + new_projection = ( + (ex.free_var(col_id), col_id) + for col_id in self.column_ids + if col_id not in columns + ) return ArrayValue( nodes.ProjectionNode( - child=self.node, assignments=((expression, output_id),) + child=self.node, + assignments=tuple(new_projection), ) ) @@ -277,25 +354,6 @@ def unpivot( ) ) - def assign(self, source_id: str, destination_id: str) -> ArrayValue: - return ArrayValue( - nodes.AssignNode( - child=self.node, source_id=source_id, destination_id=destination_id - ) - ) - - def assign_constant( - self, - destination_id: str, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> ArrayValue: - return ArrayValue( - nodes.AssignConstantNode( - child=self.node, destination_id=destination_id, value=value, dtype=dtype - ) - ) - def join( self, self_column_ids: typing.Sequence[str], diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cf1a8cb7c0..73e3460ab7 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -670,7 +670,7 @@ def apply_unary_op( """ # TODO(tbergeron): handle labels safely so callers don't need to result_id = guid.generate_guid() - expr = self._expr.project(op.as_expr(column), result_id) + expr = self._expr.project_to_id(op.as_expr(column), result_id) block = Block( expr, index_columns=self.index_columns, @@ -687,7 +687,7 @@ def apply_binary_op( result_label: Label = None, ) -> typing.Tuple[Block, str]: result_id = guid.generate_guid() - expr = self._expr.project( + expr = self._expr.project_to_id( op.as_expr(left_column_id, right_column_id), result_id ) block = Block( @@ -707,7 +707,9 @@ def apply_ternary_op( result_label: Label = None, ) -> typing.Tuple[Block, str]: result_id = 
guid.generate_guid() - expr = self._expr.project(op.as_expr(col_id_1, col_id_2, col_id_3), result_id) + expr = self._expr.project_to_id( + op.as_expr(col_id_1, col_id_2, col_id_3), result_id + ) block = Block( expr, index_columns=self.index_columns, @@ -1238,12 +1240,12 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project( + expr = expr.project_to_id( expression=ops.AsTypeOp(to_type="string").as_expr(index_col), output_id=index_col, ) prefix_op = ops.ApplyLeft(base_op=ops.add_op, left_scalar=prefix) - expr = expr.project( + expr = expr.project_to_id( expression=prefix_op.as_expr(index_col), output_id=index_col ) return Block( @@ -1262,12 +1264,12 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project( + expr = expr.project_to_id( expression=ops.AsTypeOp(to_type="string").as_expr(index_col), output_id=index_col, ) prefix_op = ops.ApplyRight(base_op=ops.add_op, right_scalar=suffix) - expr = expr.project( + expr = expr.project_to_id( expression=prefix_op.as_expr(index_col), output_id=index_col ) return Block( @@ -1576,7 +1578,7 @@ def merge( coalesced_ids = [] for left_id, right_id in zip(left_join_ids, right_join_ids): coalesced_id = guid.generate_guid() - joined_expr = joined_expr.project( + joined_expr = joined_expr.project_to_id( ops.coalesce_op.as_expr( get_column_left[left_id], get_column_right[right_id] ), diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 6a3db3f2bd..d73808e732 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -26,9 +26,8 @@ import ibis.expr.types as ibis_types import pandas -import bigframes.constants as constants import bigframes.core.compile.scalar_op_compiler as op_compilers -import bigframes.core.expression as 
expressions +import bigframes.core.expression as ex import bigframes.core.guid from bigframes.core.ordering import ( encode_order_string, @@ -96,16 +95,6 @@ def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: else None ) - @abc.abstractmethod - def select_columns(self: T, column_ids: typing.Sequence[str]) -> T: - """Creates a new expression based on this expression with new columns.""" - ... - - def drop_columns(self: T, columns: Iterable[str]) -> T: - return self.select_columns( - [col for col in self.column_ids if col not in columns] - ) - @abc.abstractmethod def filter(self: T, predicate_id: str, keep_null: bool = False) -> T: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" @@ -152,40 +141,21 @@ def _reproject_to_table(self: T) -> T: """ ... - def project_expression( + def projection( self: T, - expression: expressions.Expression, - output_column_id: typing.Optional[str] = None, + expression_id_pairs: typing.Tuple[typing.Tuple[ex.Expression, str], ...], ) -> T: """Apply an expression to the ArrayValue and assign the output to a column.""" - result_id = ( - output_column_id or expression.unbound_variables[0] - ) # overwrite input if not output id provided - bindings = { - col: self._get_ibis_column(col) for col in expression.unbound_variables - } - value = op_compiler.compile_expression(expression, bindings).name(result_id) - return self._set_or_replace_by_id(result_id, value) - - def assign(self: T, source_id: str, destination_id: str) -> T: - return self._set_or_replace_by_id( - destination_id, self._get_ibis_column(source_id) - ) + bindings = {col: self._get_ibis_column(col) for col in self.column_ids} + values = [ + op_compiler.compile_expression(expression, bindings).name(id) + for expression, id in expression_id_pairs + ] + return self._select(tuple(values)) # type: ignore - def assign_constant( - self: T, - destination_id: str, - value: typing.Any, - dtype: 
typing.Optional[bigframes.dtypes.Dtype], - ) -> T: - # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. - ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) - if ibis_value is None: - raise NotImplementedError( - f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" - ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() + @abc.abstractmethod + def _select(self: T, values: typing.Tuple[ibis_types.Value]) -> T: + ... @abc.abstractmethod def _set_or_replace_by_id(self: T, id: str, new_value: ibis_types.Value) -> T: @@ -330,14 +300,6 @@ def _to_ibis_expr( table = table.filter(ibis.random() < ibis.literal(fraction)) return table - def select_columns(self, column_ids: typing.Sequence[str]) -> UnorderedIR: - """Creates a new expression based on this expression with new columns.""" - columns = [self._get_ibis_column(col_id) for col_id in column_ids] - builder = self.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr - def filter(self, predicate_id: str, keep_null: bool = False) -> UnorderedIR: condition = typing.cast( ibis_types.BooleanValue, self._get_ibis_column(predicate_id) @@ -577,6 +539,11 @@ def _set_or_replace_by_id( builder.columns = [*self.columns, new_value.name(id)] return builder.build() + def _select(self, values: typing.Tuple[ibis_types.Value]) -> UnorderedIR: + builder = self.builder() + builder.columns = values + return builder.build() + def _reproject_to_table(self) -> UnorderedIR: """ Internal operators that projects the internal representation into a @@ -816,20 +783,6 @@ def promote_offsets(self, col_id: str) -> OrderedIR: ] return expr_builder.build() - def select_columns(self, column_ids: typing.Sequence[str]) -> OrderedIR: - """Creates a new expression based on this expression with new columns.""" - columns = [self._get_ibis_column(col_id) for col_id in column_ids] - expr = self - for ordering_column in 
set(self.column_ids).intersection( - [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] - ): - # Need to hide ordering columns that are being dropped. Alternatively, could project offsets - expr = expr._hide_column(ordering_column) - builder = expr.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr - ## Methods that only work with ordering def project_window_op( self, @@ -1221,6 +1174,29 @@ def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> Ordered builder.columns = [*self.columns, new_value.name(id)] return builder.build() + def _select(self, values: typing.Tuple[ibis_types.Value]) -> OrderedIR: + """Safely assign by id while maintaining ordering integrity.""" + # TODO: Split into explicit set and replace methods + ordering_col_ids = [ + col_ref.column_id for col_ref in self._ordering.ordering_value_columns + ] + ir = self + mappings = {value.name: value for value in values} + for ordering_id in ordering_col_ids: + # Drop case + if (ordering_id not in mappings) and (ordering_id in ir.column_ids): + # id is being dropped, hide it first + ir = ir._hide_column(ordering_id) + # Mutate case + elif (ordering_id in mappings) and not mappings[ordering_id].equals( + ir._get_any_column(ordering_id) + ): + ir = ir._hide_column(ordering_id) + + builder = ir.builder() + builder.columns = list(values) + return builder.build() + ## Ordering specific helpers def _get_any_column(self, key: str) -> ibis_types.Value: """Gets the Ibis expression for a given column. 
Will also get hidden columns.""" diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index eacee26bcb..18fcd73d19 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -79,16 +79,6 @@ def compile_join(node: nodes.JoinNode, ordered: bool = True): ) -@_compile_node.register -def compile_select(node: nodes.SelectNode, ordered: bool = True): - return compile_node(node.child, ordered).select_columns(node.column_ids) - - -@_compile_node.register -def compile_drop(node: nodes.DropColumnsNode, ordered: bool = True): - return compile_node(node.child, ordered).drop_columns(node.columns) - - @_compile_node.register def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) @@ -145,9 +135,7 @@ def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): @_compile_node.register def compile_projection(node: nodes.ProjectionNode, ordered: bool = True): result = compile_node(node.child, ordered) - for expr, id in node.assignments: - result = result.project_expression(expr, id) - return result + return result.projection(node.assignments) @_compile_node.register @@ -210,18 +198,6 @@ def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True): ) -@_compile_node.register -def compile_assign(node: nodes.AssignNode, ordered: bool = True): - return compile_node(node.child, ordered).assign(node.source_id, node.destination_id) - - -@_compile_node.register -def compile_assign_constant(node: nodes.AssignConstantNode, ordered: bool = True): - return compile_node(node.child, ordered).assign_constant( - node.destination_id, node.value, node.dtype - ) - - @_compile_node.register def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): return compile_node(node.child, ordered)._uniform_sampling(node.fraction) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py 
index 2331d3aa28..1b85192eb4 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -26,7 +26,7 @@ import pandas as pd import bigframes.constants as constants -import bigframes.core.expression as expressions +import bigframes.core.expression as ex import bigframes.dtypes import bigframes.dtypes as dtypes import bigframes.operations as ops @@ -54,7 +54,7 @@ class ScalarOpCompiler: @functools.singledispatchmethod def compile_expression( self, - expression: expressions.Expression, + expression: ex.Expression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: raise NotImplementedError(f"Unrecognized expression: {expression}") @@ -62,15 +62,19 @@ def compile_expression( @compile_expression.register def _( self, - expression: expressions.ScalarConstantExpression, + expression: ex.ScalarConstantExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - return ibis.literal(expression.value) + if pd.isnull(expression.value): # type: ignore + return ibis.null() + return ibis.literal( + expression.value, dtypes.bigframes_dtype_to_ibis_dtype(expression.dtype) + ) @compile_expression.register def _( self, - expression: expressions.UnboundVariableExpression, + expression: ex.UnboundVariableExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: if expression.id not in bindings: @@ -81,7 +85,7 @@ def _( @compile_expression.register def _( self, - expression: expressions.OpExpression, + expression: ex.OpExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: inputs = [ diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 2fb1ccb988..e2f1c57856 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -18,10 +18,22 @@ import dataclasses import itertools import typing +from typing import Optional +import bigframes.dtypes import bigframes.operations +def const( + value: typing.Hashable, 
dtype: Optional[bigframes.dtypes.Dtype] +) -> Expression: + return ScalarConstantExpression(value) + + +def free_var(id: str) -> Expression: + return UnboundVariableExpression(id) + + @dataclasses.dataclass(frozen=True) class Expression(abc.ABC): """An expression represents a computation taking N scalar inputs and producing a single output scalar.""" @@ -37,6 +49,7 @@ class ScalarConstantExpression(Expression): # TODO: Further constrain? value: typing.Hashable + dtype: Optional[bigframes.dtypes.Dtype] = None @dataclasses.dataclass(frozen=True) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 6602170b5f..18d8d24a99 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -604,7 +604,7 @@ def coalesce_columns( expr = expr.drop_columns([left_id]) elif how == "outer": coalesced_id = bigframes.core.guid.generate_guid() - expr = expr.project( + expr = expr.project_to_id( ops.coalesce_op.as_expr(left_id, right_id), coalesced_id ) expr = expr.drop_columns([left_id, right_id]) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 360f853e3e..d30db9a7f7 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -21,7 +21,7 @@ import pandas -import bigframes.core.expression as expressions +import bigframes.core.expression as ex import bigframes.core.guid from bigframes.core.ordering import OrderingColumnReference import bigframes.core.window_spec as window @@ -145,14 +145,6 @@ def __hash__(self): # Unary nodes -@dataclass(frozen=True) -class DropColumnsNode(UnaryNode): - columns: Tuple[str, ...] - - def __hash__(self): - return self._node_hash - - @dataclass(frozen=True) class PromoteOffsetsNode(UnaryNode): col_id: str @@ -187,17 +179,9 @@ def __hash__(self): return self._node_hash -@dataclass(frozen=True) -class SelectNode(UnaryNode): - column_ids: typing.Tuple[str, ...] 
- - def __hash__(self): - return self._node_hash - - @dataclass(frozen=True) class ProjectionNode(UnaryNode): - assignments: typing.Tuple[typing.Tuple[expressions.Expression, str], ...] + assignments: typing.Tuple[typing.Tuple[ex.Expression, str], ...] def __hash__(self): return self._node_hash @@ -264,25 +248,6 @@ def __hash__(self): return self._node_hash -@dataclass(frozen=True) -class AssignNode(UnaryNode): - source_id: str - destination_id: str - - def __hash__(self): - return self._node_hash - - -@dataclass(frozen=True) -class AssignConstantNode(UnaryNode): - destination_id: str - value: typing.Hashable - dtype: typing.Optional[bigframes.dtypes.Dtype] - - def __hash__(self): - return self._node_hash - - @dataclass(frozen=True) class RandomSampleNode(UnaryNode): fraction: float diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 8032093b62..9f415f3bc4 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -136,7 +136,7 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project( + expr = value.project_to_id( ops.AsTypeOp("string").as_expr("col1"), output_id="col1" )._compile_ordered() assert value._compile_ordered().columns[0].type().is_int64() @@ -154,7 +154,9 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project(ops.add_op.as_expr("col2", "col3"), "col4")._compile_ordered() + expr = value.project_to_id( + ops.add_op.as_expr("col2", "col3"), "col4" + )._compile_ordered() assert expr.columns[3].type().is_float64() actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 4 @@ -173,7 +175,7 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project( + expr = value.project_to_id( ops.where_op.as_expr("col2", "col3", "col4"), "col5" )._compile_ordered() assert expr.columns[4].type().is_float64() From 
5a6625454d9a147c8f922074a1dabe01f8a6fdaa Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 20:28:26 +0000 Subject: [PATCH 2/6] fix issues with constant expressions --- bigframes/core/compile/compiled.py | 7 ++++++- bigframes/core/compile/scalar_op_compiler.py | 9 ++++++--- bigframes/core/expression.py | 18 +++++++++++++++++- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index d73808e732..2cab6fb95d 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -151,7 +151,12 @@ def projection( op_compiler.compile_expression(expression, bindings).name(id) for expression, id in expression_id_pairs ] - return self._select(tuple(values)) # type: ignore + result = self._select(tuple(values)) # type: ignore + + # Need to reproject to convert ibis Scalar to ibis Column object + if any(exp_id[0].is_const for exp_id in expression_id_pairs): + result = result._reproject_to_table() + return result @abc.abstractmethod def _select(self: T, values: typing.Tuple[ibis_types.Value]) -> T: diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 1b85192eb4..3d9e2688ab 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -65,11 +65,14 @@ def _( expression: ex.ScalarConstantExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - if pd.isnull(expression.value): # type: ignore + if pd.isnull(expression.value) and (expression.dtype is None): # type: ignore return ibis.null() - return ibis.literal( - expression.value, dtypes.bigframes_dtype_to_ibis_dtype(expression.dtype) + dtype = ( + None + if (expression.dtype is None) + else dtypes.bigframes_dtype_to_ibis_dtype(expression.dtype) ) + return ibis.literal(expression.value, dtype) @compile_expression.register def _( diff --git a/bigframes/core/expression.py 
b/bigframes/core/expression.py index e2f1c57856..8a589671fc 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -27,7 +27,7 @@ def const( value: typing.Hashable, dtype: Optional[bigframes.dtypes.Dtype] ) -> Expression: - return ScalarConstantExpression(value) + return ScalarConstantExpression(value, dtype) def free_var(id: str) -> Expression: @@ -42,6 +42,10 @@ class Expression(abc.ABC): def unbound_variables(self) -> typing.Tuple[str, ...]: return () + @abc.abstractproperty + def is_const(self) -> bool: + return False + @dataclasses.dataclass(frozen=True) class ScalarConstantExpression(Expression): @@ -51,6 +55,10 @@ class ScalarConstantExpression(Expression): value: typing.Hashable dtype: Optional[bigframes.dtypes.Dtype] = None + @property + def is_const(self) -> bool: + return True + @dataclasses.dataclass(frozen=True) class UnboundVariableExpression(Expression): @@ -62,6 +70,10 @@ class UnboundVariableExpression(Expression): def unbound_variables(self) -> typing.Tuple[str, ...]: return (self.id,) + @property + def is_const(self) -> bool: + return False + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): @@ -80,3 +92,7 @@ def unbound_variables(self) -> typing.Tuple[str, ...]: map(lambda x: x.unbound_variables, self.inputs) ) ) + + @property + def is_const(self) -> bool: + return all(child.is_const for child in self.inputs) From c912e76f0c523f59e755b084387e7747b782b9fe Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 12 Jan 2024 13:55:37 -0800 Subject: [PATCH 3/6] refactor: remove 'partial' ops and replace with expressions (#314) --- bigframes/core/block_transforms.py | 155 ++++++++----------- bigframes/core/blocks.py | 125 +++++++-------- bigframes/core/compile/scalar_op_compiler.py | 42 +---- bigframes/core/expression.py | 14 ++ bigframes/core/groupby/__init__.py | 12 +- bigframes/core/indexers.py | 18 +-- bigframes/core/indexes/index.py | 23 +-- bigframes/core/reshape/__init__.py | 5 +- 
bigframes/dataframe.py | 95 +++++++----- bigframes/operations/__init__.py | 94 ++++------- bigframes/operations/base.py | 25 +-- bigframes/pandas/__init__.py | 12 +- bigframes/series.py | 32 ++-- 13 files changed, 285 insertions(+), 367 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 0b6886562e..345adb6be3 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -20,6 +20,7 @@ import bigframes.constants as constants import bigframes.core as core import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.ordering as ordering import bigframes.core.window_spec as windows import bigframes.dtypes as dtypes @@ -44,11 +45,10 @@ def equals(block1: blocks.Block, block2: blocks.Block) -> bool: for lcol, rcol in zip(block1.value_columns, block2.value_columns): lcolmapped = lmap[lcol] rcolmapped = rmap[rcol] - joined_block, result_id = joined_block.apply_binary_op( - lcolmapped, rcolmapped, ops.eq_null_match_op - ) - joined_block, result_id = joined_block.apply_unary_op( - result_id, ops.partial_right(ops.fillna_op, False) + joined_block, result_id = joined_block.project_expr( + ops.fillna_op.as_expr( + ops.eq_null_match_op.as_expr(lcolmapped, rcolmapped), ex.const(False) + ) ) equality_ids.append(result_id) @@ -91,9 +91,8 @@ def indicate_duplicates( agg_ops.count_op, window_spec=window_spec, ) - block, duplicate_indicator = block.apply_unary_op( - val_count_col_id, - ops.partial_right(ops.gt_op, 1), + block, duplicate_indicator = block.project_expr( + ops.gt_op.as_expr(val_count_col_id, ex.const(1)) ) return ( block.drop_columns( @@ -183,8 +182,8 @@ def _interpolate_column( # Note, this method may block, notnull = block.apply_unary_op(column, ops.notnull_op) - block, masked_offsets = block.apply_binary_op( - x_values, notnull, ops.partial_arg3(ops.where_op, None) + block, masked_offsets = block.project_expr( + ops.where_op.as_expr(x_values, 
notnull, ex.const(None)) ) block, previous_value = block.apply_window_op( @@ -271,25 +270,22 @@ def _interpolate_points_nearest( xpredict_id: str, ) -> typing.Tuple[blocks.Block, str]: """Interpolate by taking the y value of the nearest x value""" - block, left_diff = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op) - block, right_diff = block.apply_binary_op(x1_id, xpredict_id, ops.sub_op) + left_diff = ops.sub_op.as_expr(xpredict_id, x0_id) + right_diff = ops.sub_op.as_expr(x1_id, xpredict_id) # If diffs equal, choose left - block, choose_left = block.apply_binary_op(left_diff, right_diff, ops.le_op) - block, choose_left = block.apply_unary_op( - choose_left, ops.partial_right(ops.fillna_op, False) + choose_left = ops.fillna_op.as_expr( + ops.le_op.as_expr(left_diff, right_diff), ex.const(False) ) - block, nearest = block.apply_ternary_op(y0_id, choose_left, y1_id, ops.where_op) - - block, y0_exists = block.apply_unary_op(y0_id, ops.notnull_op) - block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op) - block, is_interpolation = block.apply_binary_op(y0_exists, y1_exists, ops.and_op) + nearest = ops.where_op.as_expr(y0_id, choose_left, y1_id) - block, prediction_id = block.apply_binary_op( - nearest, is_interpolation, ops.partial_arg3(ops.where_op, None) + is_interpolation = ops.and_op.as_expr( + ops.notnull_op.as_expr(y0_id), ops.notnull_op.as_expr(y1_id) ) - return block, prediction_id + return block.project_expr( + ops.where_op.as_expr(nearest, is_interpolation, ex.const(None)) + ) def _interpolate_points_ffill( @@ -302,11 +298,9 @@ def _interpolate_points_ffill( ) -> typing.Tuple[blocks.Block, str]: """Interpolates by using the preceding values""" # check for existance of y1, otherwise we are extrapolating instead of interpolating - block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op) - block, prediction_id = block.apply_binary_op( - y0_id, y1_exists, ops.partial_arg3(ops.where_op, None) + return block.project_expr( + 
ops.where_op.as_expr(y0_id, ops.notnull_op.as_expr(y1_id), ex.const(None)) ) - return block, prediction_id def drop_duplicates( @@ -519,9 +513,7 @@ def nsmallest( agg_ops.rank_op, window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) - block, condition = block.apply_unary_op( - counter, ops.partial_right(ops.le_op, n) - ) + block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n))) block = block.filter(condition) return block.drop_columns([counter, condition]) @@ -551,9 +543,7 @@ def nlargest( agg_ops.rank_op, window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) - block, condition = block.apply_unary_op( - counter, ops.partial_right(ops.le_op, n) - ) + block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n))) block = block.filter(condition) return block.drop_columns([counter, condition]) @@ -641,7 +631,7 @@ def kurt( def _mean_delta_to_power( block: blocks.Block, - n_power, + n_power: int, column_ids: typing.Sequence[str], grouping_column_ids: typing.Sequence[str], ) -> typing.Tuple[blocks.Block, typing.Sequence[str]]: @@ -649,11 +639,10 @@ def _mean_delta_to_power( window = windows.WindowSpec(grouping_keys=tuple(grouping_column_ids)) block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window) delta_ids = [] - cube_op = ops.partial_right(ops.pow_op, n_power) for val_id, mean_val_id in zip(column_ids, mean_ids): - block, delta_id = block.apply_binary_op(val_id, mean_val_id, ops.sub_op) - block, delta_power_id = block.apply_unary_op(delta_id, cube_op) - block = block.drop_columns([delta_id]) + delta = ops.sub_op.as_expr(val_id, mean_val_id) + delta_power = ops.pow_op.as_expr(delta, ex.const(n_power)) + block, delta_power_id = block.project_expr(delta_power) delta_ids.append(delta_power_id) return block, delta_ids @@ -664,31 +653,26 @@ def _skew_from_moments_and_count( # Calculate skew using count, third moment and population variance # See G1 estimator: # 
https://en.wikipedia.org/wiki/Skewness#Sample_skewness - block, denominator_id = block.apply_unary_op( - moment2_id, ops.partial_right(ops.unsafe_pow_op, 3 / 2) - ) - block, base_id = block.apply_binary_op(moment3_id, denominator_id, ops.div_op) - block, countminus1_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 1) - ) - block, countminus2_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 2) - ) - block, adjustment_id = block.apply_binary_op(count_id, countminus1_id, ops.mul_op) - block, adjustment_id = block.apply_unary_op( - adjustment_id, ops.partial_right(ops.unsafe_pow_op, 1 / 2) + moments_estimator = ops.div_op.as_expr( + moment3_id, ops.pow_op.as_expr(moment2_id, ex.const(3 / 2)) ) - block, adjustment_id = block.apply_binary_op( - adjustment_id, countminus2_id, ops.div_op + + countminus1 = ops.sub_op.as_expr(count_id, ex.const(1)) + countminus2 = ops.sub_op.as_expr(count_id, ex.const(2)) + adjustment = ops.div_op.as_expr( + ops.unsafe_pow_op.as_expr( + ops.mul_op.as_expr(count_id, countminus1), ex.const(1 / 2) + ), + countminus2, ) - block, skew_id = block.apply_binary_op(base_id, adjustment_id, ops.mul_op) + + skew = ops.mul_op.as_expr(moments_estimator, adjustment) # Need to produce NA if have less than 3 data points - block, na_cond_id = block.apply_unary_op(count_id, ops.partial_right(ops.ge_op, 3)) - block, skew_id = block.apply_binary_op( - skew_id, na_cond_id, ops.partial_arg3(ops.where_op, None) + cleaned_skew = ops.where_op.as_expr( + skew, ops.ge_op.as_expr(count_id, ex.const(3)), ex.const(None) ) - return block, skew_id + return block.project_expr(cleaned_skew) def _kurt_from_moments_and_count( @@ -701,49 +685,42 @@ def _kurt_from_moments_and_count( # adjustment = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) # kurtosis = (numerator / denominator) - adjustment - # Numerator - block, countminus1_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 1) - ) - block, countplus1_id = 
block.apply_unary_op( - count_id, ops.partial_right(ops.add_op, 1) + numerator = ops.mul_op.as_expr( + moment4_id, + ops.mul_op.as_expr( + ops.sub_op.as_expr(count_id, ex.const(1)), + ops.add_op.as_expr(count_id, ex.const(1)), + ), ) - block, num_adj = block.apply_binary_op(countplus1_id, countminus1_id, ops.mul_op) - block, numerator_id = block.apply_binary_op(moment4_id, num_adj, ops.mul_op) # Denominator - block, countminus2_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 2) - ) - block, countminus3_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 3) - ) - block, denom_adj = block.apply_binary_op(countminus2_id, countminus3_id, ops.mul_op) - block, popvar_squared = block.apply_unary_op( - moment2_id, ops.partial_right(ops.unsafe_pow_op, 2) + countminus2 = ops.sub_op.as_expr(count_id, ex.const(2)) + countminus3 = ops.sub_op.as_expr(count_id, ex.const(3)) + + # Denominator + denominator = ops.mul_op.as_expr( + ops.unsafe_pow_op.as_expr(moment2_id, ex.const(2)), + ops.mul_op.as_expr(countminus2, countminus3), ) - block, denominator_id = block.apply_binary_op(popvar_squared, denom_adj, ops.mul_op) # Adjustment - block, countminus1_square = block.apply_unary_op( - countminus1_id, ops.partial_right(ops.unsafe_pow_op, 2) - ) - block, adj_num = block.apply_unary_op( - countminus1_square, ops.partial_right(ops.mul_op, 3) + adj_num = ops.mul_op.as_expr( + ops.unsafe_pow_op.as_expr( + ops.sub_op.as_expr(count_id, ex.const(1)), ex.const(2) + ), + ex.const(3), ) - block, adj_denom = block.apply_binary_op(countminus2_id, countminus3_id, ops.mul_op) - block, adjustment_id = block.apply_binary_op(adj_num, adj_denom, ops.div_op) + adj_denom = ops.mul_op.as_expr(countminus2, countminus3) + adjustment = ops.div_op.as_expr(adj_num, adj_denom) # Combine - block, base_id = block.apply_binary_op(numerator_id, denominator_id, ops.div_op) - block, kurt_id = block.apply_binary_op(base_id, adjustment_id, ops.sub_op) + kurt = 
ops.sub_op.as_expr(ops.div_op.as_expr(numerator, denominator), adjustment) # Need to produce NA if have less than 4 data points - block, na_cond_id = block.apply_unary_op(count_id, ops.partial_right(ops.ge_op, 4)) - block, kurt_id = block.apply_binary_op( - kurt_id, na_cond_id, ops.partial_arg3(ops.where_op, None) + cleaned_kurt = ops.where_op.as_expr( + kurt, ops.ge_op.as_expr(count_id, ex.const(4)), ex.const(None) ) - return block, kurt_id + return block.project_expr(cleaned_kurt) def align( diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 73e3460ab7..e484148493 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -35,6 +35,7 @@ import bigframes._config.sampling_options as sampling_options import bigframes.constants as constants import bigframes.core as core +import bigframes.core.expression as ex import bigframes.core.guid as guid import bigframes.core.indexes as indexes import bigframes.core.joins.name_resolution as join_names @@ -662,23 +663,32 @@ def with_index_labels(self, value: typing.Sequence[Label]) -> Block: index_labels=tuple(value), ) - def apply_unary_op( - self, column: str, op: ops.UnaryOp, result_label: Label = None + def project_expr( + self, expr: ex.Expression, label: Label = None ) -> typing.Tuple[Block, str]: """ - Apply a unary op to the block. Creates a new column to store the result. + Apply a scalar expression to the block. Creates a new column to store the result. 
""" # TODO(tbergeron): handle labels safely so callers don't need to result_id = guid.generate_guid() expr = self._expr.project_to_id(op.as_expr(column), result_id) block = Block( - expr, + array_val, index_columns=self.index_columns, - column_labels=[*self.column_labels, result_label], + column_labels=[*self.column_labels, label], index_labels=self.index.names, ) return (block, result_id) + def apply_unary_op( + self, column: str, op: ops.UnaryOp, result_label: Label = None + ) -> typing.Tuple[Block, str]: + """ + Apply a unary op to the block. Creates a new column to store the result. + """ + expr = op.as_expr(column) + return self.project_expr(expr, result_label) + def apply_binary_op( self, left_column_id: str, @@ -686,17 +696,8 @@ def apply_binary_op( op: ops.BinaryOp, result_label: Label = None, ) -> typing.Tuple[Block, str]: - result_id = guid.generate_guid() - expr = self._expr.project_to_id( - op.as_expr(left_column_id, right_column_id), result_id - ) - block = Block( - expr, - index_columns=self.index_columns, - column_labels=[*self.column_labels, result_label], - index_labels=self.index.names, - ) - return (block, result_id) + expr = op.as_expr(left_column_id, right_column_id) + return self.project_expr(expr, result_label) def apply_ternary_op( self, @@ -706,17 +707,8 @@ def apply_ternary_op( op: ops.TernaryOp, result_label: Label = None, ) -> typing.Tuple[Block, str]: - result_id = guid.generate_guid() - expr = self._expr.project_to_id( - op.as_expr(col_id_1, col_id_2, col_id_3), result_id - ) - block = Block( - expr, - index_columns=self.index_columns, - column_labels=[*self.column_labels, result_label], - index_labels=self.index.names, - ) - return (block, result_id) + expr = op.as_expr(col_id_1, col_id_2, col_id_3) + return self.project_expr(expr, result_label) def multi_apply_window_op( self, @@ -1154,43 +1146,37 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1): conditions = [] if start != 0: if start > 0: - op = 
ops.partial_right(ops.ge_op, start) assert positive_offsets - block, start_cond = block.apply_unary_op(positive_offsets, op) + conditions.append(ops.ge_op.as_expr(positive_offsets, ex.const(start))) else: - op = ops.partial_right(ops.le_op, -start - 1) assert negative_offsets - block, start_cond = block.apply_unary_op(negative_offsets, op) - conditions.append(start_cond) + conditions.append( + ops.le_op.as_expr(negative_offsets, ex.const(-start - 1)) + ) if stop is not None: if stop >= 0: - op = ops.partial_right(ops.lt_op, stop) assert positive_offsets - block, stop_cond = block.apply_unary_op(positive_offsets, op) + conditions.append(ops.lt_op.as_expr(positive_offsets, ex.const(stop))) else: - op = ops.partial_right(ops.gt_op, -stop - 1) assert negative_offsets - block, stop_cond = block.apply_unary_op(negative_offsets, op) - conditions.append(stop_cond) - + conditions.append( + ops.gt_op.as_expr(negative_offsets, ex.const(-stop - 1)) + ) if step > 1: - op = ops.partial_right(ops.mod_op, step) if start >= 0: - op = ops.partial_right(ops.sub_op, start) assert positive_offsets - block, start_diff = block.apply_unary_op(positive_offsets, op) + start_diff = ops.sub_op.as_expr(positive_offsets, ex.const(start)) else: - op = ops.partial_right(ops.sub_op, -start + 1) assert negative_offsets - block, start_diff = block.apply_unary_op(negative_offsets, op) - modulo_op = ops.partial_right(ops.mod_op, step) - block, mod = block.apply_unary_op(start_diff, modulo_op) - is_zero_op = ops.partial_right(ops.eq_op, 0) - block, step_cond = block.apply_unary_op(mod, is_zero_op) + start_diff = ops.sub_op.as_expr(negative_offsets, ex.const(-start + 1)) + step_cond = ops.eq_op.as_expr( + ops.mod_op.as_expr(start_diff, ex.const(step)), ex.const(0) + ) conditions.append(step_cond) for cond in conditions: - block = block.filter(cond) + block, cond_id = block.project_expr(cond) + block = block.filter(cond_id) return block.select_columns(self.value_columns) @@ -1240,13 +1226,12 @@ def 
add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project_to_id( - expression=ops.AsTypeOp(to_type="string").as_expr(index_col), - output_id=index_col, + add_prefix = ops.add_op.as_expr( + ex.const(prefix), ops.AsTypeOp(to_type="string").as_expr(index_col) ) - prefix_op = ops.ApplyLeft(base_op=ops.add_op, left_scalar=prefix) expr = expr.project_to_id( - expression=prefix_op.as_expr(index_col), output_id=index_col + expression=add_prefix, + output_id=index_col, ) return Block( expr, @@ -1264,13 +1249,12 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project_to_id( - expression=ops.AsTypeOp(to_type="string").as_expr(index_col), - output_id=index_col, + add_suffix = ops.add_op.as_expr( + ops.AsTypeOp(to_type="string").as_expr(index_col), ex.const(suffix) ) - prefix_op = ops.ApplyRight(base_op=ops.add_op, right_scalar=suffix) expr = expr.project_to_id( - expression=prefix_op.as_expr(index_col), output_id=index_col + expression=add_suffix, + output_id=index_col, ) return Block( expr, @@ -1470,28 +1454,23 @@ def _create_pivot_column_index( def _create_pivot_col( block: Block, columns: typing.Sequence[str], value_col: str, value ) -> typing.Tuple[Block, str]: - cond_id = "" + condition: typing.Optional[ex.Expression] = None nlevels = len(columns) for i in range(len(columns)): uvalue_level = value[i] if nlevels > 1 else value if pd.isna(uvalue_level): - block, eq_id = block.apply_unary_op( - columns[i], - ops.isnull_op, - ) + equality = ops.isnull_op.as_expr(columns[i]) else: - block, eq_id = block.apply_unary_op( - columns[i], ops.partial_right(ops.eq_op, uvalue_level) - ) - if cond_id: - block, cond_id = block.apply_binary_op(eq_id, cond_id, ops.and_op) + equality = ops.eq_op.as_expr(columns[i], ex.const(uvalue_level)) + if condition is not 
None: + condition = ops.and_op.as_expr(equality, condition) else: - cond_id = eq_id - block, masked_id = block.apply_binary_op( - value_col, cond_id, ops.partial_arg3(ops.where_op, None) - ) + condition = equality - return block, masked_id + assert condition is not None + return block.project_expr( + ops.where_op.as_expr(value_col, condition, ex.const(None)) + ) def _get_unique_values( self, columns: Sequence[str], max_unique_values: int diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 3d9e2688ab..1943eee96e 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -28,7 +28,6 @@ import bigframes.constants as constants import bigframes.core.expression as ex import bigframes.dtypes -import bigframes.dtypes as dtypes import bigframes.operations as ops _ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) @@ -65,14 +64,7 @@ def _( expression: ex.ScalarConstantExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - if pd.isnull(expression.value) and (expression.dtype is None): # type: ignore - return ibis.null() - dtype = ( - None - if (expression.dtype is None) - else dtypes.bigframes_dtype_to_ibis_dtype(expression.dtype) - ) - return ibis.literal(expression.value, dtype) + return bigframes.dtypes.literal_to_ibis_scalar(expression.value, expression.dtype) @compile_expression.register def _( @@ -1139,38 +1131,6 @@ def clip_op( ) -# Composition Ops -@scalar_op_compiler.register_unary_op(ops.ApplyRight, pass_op=True) -def apply_right(input: ibis_types.Value, op: ops.ApplyRight): - right = dtypes.literal_to_ibis_scalar(op.right_scalar, validate=False) - return scalar_op_compiler.compile_row_op(op.base_op, (input, right)) - - -@scalar_op_compiler.register_unary_op(ops.ApplyLeft, pass_op=True) -def apply_left(input: ibis_types.Value, op: ops.ApplyLeft): - left = dtypes.literal_to_ibis_scalar(op.left_scalar, 
validate=False) - return scalar_op_compiler.compile_row_op(op.base_op, (left, input)) - - -@scalar_op_compiler.register_binary_op(ops.ReverseArgsOp, pass_op=True) -def apply_reversed( - input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ReverseArgsOp -): - return scalar_op_compiler.compile_row_op(op.base_op, (input2, input1)) - - -@scalar_op_compiler.register_binary_op(ops.ApplyArg1, pass_op=True) -def apply_arg1(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg1): - arg1 = dtypes.literal_to_ibis_scalar(op.scalar, validate=False) - return scalar_op_compiler.compile_row_op(op.base_op, (arg1, input1, input2)) - - -@scalar_op_compiler.register_binary_op(ops.ApplyArg3, pass_op=True) -def apply_arg3(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg3): - arg3 = dtypes.literal_to_ibis_scalar(op.scalar, validate=False) - return scalar_op_compiler.compile_row_op(op.base_op, (input1, input2, arg3)) - - # Helpers def is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 8a589671fc..400c3a5510 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -42,6 +42,9 @@ class Expression(abc.ABC): def unbound_variables(self) -> typing.Tuple[str, ...]: return () + def rename(self, name_mapping: dict[str, str]) -> Expression: + return self + @abc.abstractproperty def is_const(self) -> bool: return False @@ -70,6 +73,12 @@ class UnboundVariableExpression(Expression): def unbound_variables(self) -> typing.Tuple[str, ...]: return (self.id,) + def rename(self, name_mapping: dict[str, str]) -> Expression: + if self.id in name_mapping: + return UnboundVariableExpression(name_mapping[self.id]) + else: + return self + @property def is_const(self) -> bool: return False @@ -93,6 +102,11 @@ def unbound_variables(self) -> typing.Tuple[str, ...]: ) ) + def rename(self, name_mapping: dict[str, str]) -> Expression: 
+ return OpExpression( + self.op, tuple(input.rename(name_mapping) for input in self.inputs) + ) + @property def is_const(self) -> bool: return all(child.is_const for child in self.inputs) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 66ba901649..ab6b15e7b9 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -28,7 +28,6 @@ import bigframes.core.window as windows import bigframes.dataframe as df import bigframes.dtypes as dtypes -import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.series as series import third_party.bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby @@ -540,10 +539,13 @@ def cummin(self, *args, **kwargs) -> series.Series: ) def cumcount(self, *args, **kwargs) -> series.Series: - return self._apply_window_op( - agg_ops.rank_op, - discard_name=True, - )._apply_unary_op(ops.partial_right(ops.sub_op, 1)) + return ( + self._apply_window_op( + agg_ops.rank_op, + discard_name=True, + ) + - 1 + ) def shift(self, periods=1) -> series.Series: """Shift index by desired number of periods.""" diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 12a1303d29..6998d0e974 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -22,6 +22,7 @@ import bigframes.constants as constants import bigframes.core.blocks +import bigframes.core.expression as ex import bigframes.core.guid as guid import bigframes.core.indexes as indexes import bigframes.core.scalar @@ -63,17 +64,14 @@ def __setitem__(self, key, value) -> None: index_column = block.index_columns[0] # if index == key return value else value_colum - block, insert_cond = block.apply_unary_op( - index_column, ops.partial_right(ops.eq_op, key) - ) - block, result_id = block.apply_binary_op( - insert_cond, - self._series._value_column, - ops.partial_arg1(ops.where_op, value), - ) - block = block.copy_values(result_id, 
value_column).drop_columns( - [insert_cond, result_id] + block, result_id = block.project_expr( + ops.where_op.as_expr( + ex.const(value), + ops.eq_op.as_expr(index_column, ex.const(key)), + self._series._value_column, + ) ) + block = block.copy_values(result_id, value_column).drop_columns([result_id]) self._series._set_block(block) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 18d8d24a99..4ec11cb163 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -26,6 +26,7 @@ import bigframes.core as core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.joins as joining import bigframes.core.ordering as order import bigframes.core.utils as utils @@ -186,7 +187,7 @@ def astype( ) -> Index: if self.nlevels > 1: raise TypeError("Multiindex does not support 'astype'") - return self._apply_unary_op(ops.AsTypeOp(to_type=dtype)) + return self._apply_unary_expr(ops.AsTypeOp(to_type=dtype).as_expr("arg")) def all(self) -> bool: if self.nlevels > 1: @@ -261,7 +262,7 @@ def value_counts( def fillna(self, value=None) -> Index: if self.nlevels > 1: raise TypeError("Multiindex does not support 'fillna'") - return self._apply_unary_op(ops.partial_right(ops.fillna_op, value)) + return self._apply_unary_expr(ops.fillna_op.as_expr("arg", ex.const(value))) def rename(self, name: Union[str, Sequence[str]]) -> Index: names = [name] if isinstance(name, str) else list(name) @@ -284,8 +285,8 @@ def drop( inverse_condition_id, ops.invert_op ) else: - block, condition_id = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, labels) + block, condition_id = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(labels)) ) block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) @@ -308,19 +309,23 @@ def isin(self, values) -> Index: f"isin(), you passed a 
[{type(values).__name__}]" ) - return self._apply_unary_op( - ops.IsInOp(values=tuple(values), match_nulls=True) + return self._apply_unary_expr( + ops.IsInOp(values=tuple(values), match_nulls=True).as_expr("arg") ).fillna(value=False) - def _apply_unary_op( + def _apply_unary_expr( self, - op: ops.UnaryOp, + op: ex.Expression, ) -> Index: """Applies a unary operator to the index.""" + if len(op.unbound_variables) != 1: + raise ValueError("Expression must have exactly 1 unbound variable.") + unbound_variable = op.unbound_variables[0] + block = self._block result_ids = [] for col in self._block.index_columns: - block, result_id = block.apply_unary_op(col, op) + block, result_id = block.project_expr(op.rename({unbound_variable: col})) result_ids.append(result_id) block = block.set_index(result_ids, index_labels=self._block.index_labels) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index d9cc99a036..cadd8e5145 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -20,6 +20,7 @@ import bigframes.constants as constants import bigframes.core as core +import bigframes.core.expression as ex import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dataframe @@ -165,7 +166,7 @@ def qcut( ordering=(order.OrderingColumnReference(x._value_column),), ), ) - block, result = block.apply_binary_op( - result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label + block, result = block.project_expr( + ops.where_op.as_expr(result, nullity_id, ex.const(None)), label=label ) return bigframes.series.Series(block.select_column(result)) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1f039904f0..1288117395 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -47,6 +47,7 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import 
bigframes.core.expression as ex import bigframes.core.groupby as groupby import bigframes.core.guid import bigframes.core.indexers as indexers @@ -656,25 +657,34 @@ def _apply_binop( op, axis: str | int = "columns", how: str = "outer", + reverse: bool = False, ): if isinstance(other, (float, int)): - return self._apply_scalar_binop(other, op) + return self._apply_scalar_binop(other, op, reverse=reverse) elif isinstance(other, bigframes.series.Series): - return self._apply_series_binop(other, op, axis=axis, how=how) + return self._apply_series_binop( + other, op, axis=axis, how=how, reverse=reverse + ) elif isinstance(other, DataFrame): - return self._apply_dataframe_binop(other, op, how=how) + return self._apply_dataframe_binop(other, op, how=how, reverse=reverse) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}." f"{constants.FEEDBACK_LINK}" ) - def _apply_scalar_binop(self, other: float | int, op: ops.BinaryOp) -> DataFrame: + def _apply_scalar_binop( + self, other: float | int, op: ops.BinaryOp, reverse: bool = False + ) -> DataFrame: block = self._block - partial_op = ops.ApplyRight(base_op=op, right_scalar=other) for column_id, label in zip( self._block.value_columns, self._block.column_labels ): - block, _ = block.apply_unary_op(column_id, partial_op, result_label=label) + expr = ( + op.as_expr(ex.const(other), column_id) + if reverse + else op.as_expr(column_id, ex.const(other)) + ) + block, _ = block.project_expr(expr, label) block = block.drop_columns([column_id]) return DataFrame(block) @@ -684,6 +694,7 @@ def _apply_series_binop( op: ops.BinaryOp, axis: str | int = "columns", how: str = "outer", + reverse: bool = False, ) -> DataFrame: if axis not in ("columns", "index", 0, 1): raise ValueError(f"Invalid input: axis {axis}.") @@ -703,12 +714,13 @@ def _apply_series_binop( for column_id, label in zip( self._block.value_columns, self._block.column_labels ): - block, _ = 
block.apply_binary_op( - get_column_left[column_id], - series_col, - op, - result_label=label, + self_col = get_column_left[column_id] + expr = ( + op.as_expr(series_col, self_col) + if reverse + else op.as_expr(self_col, series_col) ) + block, _ = block.project_expr(expr, label) block = block.drop_columns([get_column_left[column_id]]) block = block.drop_columns([series_col]) @@ -716,7 +728,11 @@ def _apply_series_binop( return DataFrame(block) def _apply_dataframe_binop( - self, other: DataFrame, op: ops.BinaryOp, how: str = "outer" + self, + other: DataFrame, + op: ops.BinaryOp, + how: str = "outer", + reverse: bool = False, ) -> DataFrame: # Join rows joined_index, (get_column_left, get_column_right) = self._block.index.join( @@ -738,31 +754,32 @@ def _apply_dataframe_binop( for left_index, right_index in column_indices: if left_index >= 0 and right_index >= 0: # -1 indices indicate missing - left_col_id = self._block.value_columns[left_index] - right_col_id = other._block.value_columns[right_index] - block, result_col_id = block.apply_binary_op( - get_column_left[left_col_id], - get_column_right[right_col_id], - op, + self_col_id = get_column_left[self._block.value_columns[left_index]] + other_col_id = get_column_right[other._block.value_columns[right_index]] + expr = ( + op.as_expr(other_col_id, self_col_id) + if reverse + else op.as_expr(self_col_id, other_col_id) ) - binop_result_ids.append(result_col_id) elif left_index >= 0: - left_col_id = self._block.value_columns[left_index] - block, result_col_id = block.apply_unary_op( - get_column_left[left_col_id], - ops.partial_right(op, None), + self_col_id = get_column_left[self._block.value_columns[left_index]] + expr = ( + op.as_expr(ex.const(None), self_col_id) + if reverse + else op.as_expr(self_col_id, ex.const(None)) ) - binop_result_ids.append(result_col_id) elif right_index >= 0: - right_col_id = other._block.value_columns[right_index] - block, result_col_id = block.apply_unary_op( - 
get_column_right[right_col_id], - ops.partial_left(op, None), + other_col_id = get_column_right[other._block.value_columns[right_index]] + expr = ( + op.as_expr(other_col_id, ex.const(None)) + if reverse + else op.as_expr(ex.const(None), other_col_id) ) - binop_result_ids.append(result_col_id) else: # Should not be possible raise ValueError("No right or left index.") + block, result_col_id = block.project_expr(expr) + binop_result_ids.append(result_col_id) block = block.select_columns(binop_result_ids).with_column_labels(columns) return DataFrame(block) @@ -822,7 +839,7 @@ def rsub( other: float | int | bigframes.series.Series | DataFrame, axis: str | int = "columns", ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.sub_op), axis=axis) + return self._apply_binop(other, ops.sub_op, axis=axis, reverse=True) __rsub__ = rsub @@ -849,7 +866,7 @@ def rtruediv( other: float | int | bigframes.series.Series | DataFrame, axis: str | int = "columns", ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.div_op), axis=axis) + return self._apply_binop(other, ops.div_op, axis=axis, reverse=True) __rtruediv__ = rdiv = rtruediv @@ -867,7 +884,7 @@ def rfloordiv( other: float | int | bigframes.series.Series | DataFrame, axis: str | int = "columns", ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.floordiv_op), axis=axis) + return self._apply_binop(other, ops.floordiv_op, axis=axis, reverse=True) __rfloordiv__ = rfloordiv @@ -875,7 +892,7 @@ def mod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int return self._apply_binop(other, ops.mod_op, axis=axis) def rmod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore - return self._apply_binop(other, ops.reverse(ops.mod_op), axis=axis) + return self._apply_binop(other, ops.mod_op, axis=axis, reverse=True) __mod__ = mod @@ -889,7 +906,7 @@ def pow( def rpow( self, other: int | bigframes.series.Series, axis: str 
| int = "columns" ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.pow_op), axis=axis) + return self._apply_binop(other, ops.pow_op, axis=axis, reverse=True) __pow__ = pow @@ -1101,8 +1118,8 @@ def drop( condition_id = None for i, idx in enumerate(index): level_id = self._resolve_levels(i)[0] - block, condition_id_cur = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, idx) + block, condition_id_cur = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(idx)) ) if condition_id: block, condition_id = block.apply_binary_op( @@ -1122,8 +1139,8 @@ def drop( elif isinstance(index, indexes.Index): return self._drop_by_index(index) else: - block, condition_id = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, index) + block, condition_id = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(index)) ) block = block.filter(condition_id, keep_null=True).select_columns( self._block.value_columns @@ -3031,7 +3048,7 @@ def __array_ufunc__( if inputs[0] is self: return self._apply_binop(inputs[1], binop) else: - return self._apply_binop(inputs[0], ops.reverse(binop)) + return self._apply_binop(inputs[0], binop, reverse=True) return NotImplemented diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 39ca52394e..9737df94f9 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -49,11 +49,13 @@ def name(self) -> str: def arguments(self) -> int: return 1 - def as_expr(self, input_id: str) -> bigframes.core.expression.Expression: + def as_expr( + self, input_id: typing.Union[str, bigframes.core.expression.Expression] = "arg" + ) -> bigframes.core.expression.Expression: import bigframes.core.expression return bigframes.core.expression.OpExpression( - self, (bigframes.core.expression.UnboundVariableExpression(input_id),) + self, (_convert_expr_input(input_id),) ) @@ -68,15 +70,17 @@ def arguments(self) -> int: return 2 def as_expr( - self, left_input: str, 
right_input: str + self, + left_input: typing.Union[str, bigframes.core.expression.Expression] = "arg1", + right_input: typing.Union[str, bigframes.core.expression.Expression] = "arg2", ) -> bigframes.core.expression.Expression: import bigframes.core.expression return bigframes.core.expression.OpExpression( self, ( - bigframes.core.expression.UnboundVariableExpression(left_input), - bigframes.core.expression.UnboundVariableExpression(right_input), + _convert_expr_input(left_input), + _convert_expr_input(right_input), ), ) @@ -92,20 +96,35 @@ def arguments(self) -> int: return 3 def as_expr( - self, input1: str, input2: str, input3: str + self, + input1: typing.Union[str, bigframes.core.expression.Expression] = "arg1", + input2: typing.Union[str, bigframes.core.expression.Expression] = "arg2", + input3: typing.Union[str, bigframes.core.expression.Expression] = "arg3", ) -> bigframes.core.expression.Expression: import bigframes.core.expression return bigframes.core.expression.OpExpression( self, ( - bigframes.core.expression.UnboundVariableExpression(input1), - bigframes.core.expression.UnboundVariableExpression(input2), - bigframes.core.expression.UnboundVariableExpression(input3), + _convert_expr_input(input1), + _convert_expr_input(input2), + _convert_expr_input(input3), ), ) +def _convert_expr_input( + input: typing.Union[str, bigframes.core.expression.Expression] +) -> bigframes.core.expression.Expression: + """Allows creating free variables with just a string""" + import bigframes.core.expression + + if isinstance(input, str): + return bigframes.core.expression.UnboundVariableExpression(input) + else: + return input + + # Operation Factories def create_unary_op(name: str) -> UnaryOp: return dataclasses.make_dataclass( @@ -309,63 +328,6 @@ class MapOp(UnaryOp): mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...] 
-# Operation Composition -# Meta-ops that do partial application or parameter remapping -# Subject to change, may convert to explicit tree -@dataclasses.dataclass(frozen=True) -class ApplyRight(UnaryOp): - name: typing.ClassVar[str] = "apply_right" - base_op: BinaryOp - right_scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ApplyLeft(UnaryOp): - name: typing.ClassVar[str] = "apply_left" - base_op: BinaryOp - left_scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ApplyArg1(BinaryOp): - name: typing.ClassVar[str] = "apply_arg1" - base_op: TernaryOp - scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ApplyArg3(BinaryOp): - name: typing.ClassVar[str] = "apply_arg3" - base_op: TernaryOp - scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ReverseArgsOp(BinaryOp): - name: typing.ClassVar[str] = "apply_reverse" - base_op: BinaryOp - - -def partial_left(op: BinaryOp, scalar: typing.Any) -> UnaryOp: - return ApplyLeft(base_op=op, left_scalar=scalar) - - -def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp: - return ApplyRight(base_op=op, right_scalar=scalar) - - -def partial_arg1(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return ApplyArg1(base_op=op, scalar=scalar) - - -def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return ApplyArg3(base_op=op, scalar=scalar) - - -def reverse(op: BinaryOp) -> BinaryOp: - return ReverseArgsOp(base_op=op) - - # Binary Ops fillna_op = create_binary_op(name="fillna") cliplower_op = create_binary_op(name="clip_lower") diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 8989255f7e..077815a9d6 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -20,6 +20,7 @@ import bigframes.constants as constants import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.scalar as scalars import bigframes.dtypes import bigframes.operations as ops @@ -136,6 +137,7 
@@ def _apply_binary_op( other: typing.Any, op: ops.BinaryOp, alignment: typing.Literal["outer", "left"] = "outer", + reverse: bool = False, ) -> series.Series: """Applies a binary operator to the series and other.""" if isinstance(other, pd.Series): @@ -144,11 +146,7 @@ def _apply_binary_op( f"Pandas series not supported as operand. {constants.FEEDBACK_LINK}" ) if isinstance(other, series.Series): - (left, right, block) = self._align(other, how=alignment) - - block, result_id = block.apply_binary_op( - left, right, op, self._value_column - ) + (self_col, other_col, block) = self._align(other, how=alignment) name = self._name if ( @@ -157,13 +155,20 @@ def _apply_binary_op( and alignment == "outer" ): name = None - - return series.Series( - block.select_column(result_id).assign_label(result_id, name) + expr = op.as_expr( + other_col if reverse else self_col, self_col if reverse else other_col ) + block, result_id = block.project_expr(expr, name) + return series.Series(block.select_column(result_id)) + else: - partial_op = ops.ApplyRight(base_op=op, right_scalar=other) - return self._apply_unary_op(partial_op) + name = self._name + expr = op.as_expr( + ex.const(other) if reverse else self._value_column, + self._value_column if reverse else ex.const(other), + ) + block, result_id = self._block.project_expr(expr, name) + return series.Series(block.select_column(result_id)) def _apply_corr_aggregation(self, other: series.Series) -> float: (left, right, block) = self._align(other, how="outer") diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 0f060a23e8..554acda202 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -48,6 +48,7 @@ import bigframes._config as config import bigframes.constants as constants import bigframes.core.blocks +import bigframes.core.expression as ex import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.core.reshape @@ -294,14 +295,13 @@ def 
_perform_get_dummies_block_operations( new_column_label = f"{column_label}{value}" if column_label == "": new_column_label = value - new_block, new_id = block.apply_unary_op( - column_id, ops.ApplyLeft(ops.eq_op, value) + new_block, new_id = block.project_expr( + ops.eq_op.as_expr(column_id, ex.const(value)) ) intermediate_col_ids.append(new_id) - block, _ = new_block.apply_unary_op( - new_id, - ops.ApplyRight(ops.fillna_op, False), - result_label=new_column_label, + block, _ = new_block.project_expr( + ops.fillna_op.as_expr(new_id, ex.const(False)), + label=new_column_label, ) if dummy_na: # dummy column name for na depends on the dtype diff --git a/bigframes/series.py b/bigframes/series.py index 7a4600a324..2371aad780 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -34,6 +34,7 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.groupby as groupby import bigframes.core.indexers import bigframes.core.indexes as indexes @@ -188,8 +189,8 @@ def rename( # Will throw if value type isn't compatible with index type. 
block, const_id = block.create_constant(v, dtype=idx_dtype) - block, cond_id = block.apply_unary_op( - idx_id, ops.ApplyRight(base_op=ops.ne_op, right_scalar=k) + block, cond_id = block.project_expr( + ops.ne_op.as_expr(idx_id, ex.const(k)) ) block, new_idx_id = block.apply_ternary_op( idx_id, cond_id, const_id, ops.where_op @@ -342,8 +343,8 @@ def drop( inverse_condition_id, ops.invert_op ) else: - block, condition_id = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, index) + block, condition_id = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(index)) ) block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) @@ -488,11 +489,8 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value): block, cond = self._block.apply_unary_op( self._value_column, ops.IsInOp(tuple(to_replace_list)) ) - block, result_col = block.apply_binary_op( - cond, - self._value_column, - ops.partial_arg1(ops.where_op, value), - result_label=self.name, + block, result_col = block.project_expr( + ops.where_op.as_expr(ex.const(value), cond, self._value_column), self.name ) return Series(block.select_column(result_col)) @@ -605,7 +603,7 @@ def add(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.add_op) def radd(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.add_op)) + return self._apply_binary_op(other, ops.add_op, reverse=True) def __sub__(self, other: float | int | Series) -> Series: return self.sub(other) @@ -617,7 +615,7 @@ def sub(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.sub_op) def rsub(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.sub_op)) + return self._apply_binary_op(other, ops.sub_op, reverse=True) subtract = sub @@ -631,7 +629,7 @@ def mul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, 
ops.mul_op) def rmul(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.mul_op)) + return self._apply_binary_op(other, ops.mul_op, reverse=True) multiply = mul @@ -645,7 +643,7 @@ def truediv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.div_op) def rtruediv(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.div_op)) + return self._apply_binary_op(other, ops.div_op, reverse=True) div = truediv @@ -663,7 +661,7 @@ def floordiv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.floordiv_op) def rfloordiv(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.floordiv_op)) + return self._apply_binary_op(other, ops.floordiv_op, reverse=True) def __pow__(self, other: float | int | Series) -> Series: return self.pow(other) @@ -675,7 +673,7 @@ def pow(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.pow_op) def rpow(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.pow_op)) + return self._apply_binary_op(other, ops.pow_op, reverse=True) def __lt__(self, other: float | int | Series) -> Series: # type: ignore return self.lt(other) @@ -711,7 +709,7 @@ def mod(self, other) -> Series: # type: ignore return self._apply_binary_op(other, ops.mod_op) def rmod(self, other) -> Series: # type: ignore - return self._apply_binary_op(other, ops.reverse(ops.mod_op)) + return self._apply_binary_op(other, ops.mod_op, reverse=True) def divmod(self, other) -> Tuple[Series, Series]: # type: ignore # TODO(huanc): when self and other both has dtype int and other contains zeros, @@ -1503,7 +1501,7 @@ def __array_ufunc__( if inputs[0] is self: return self._apply_binary_op(inputs[1], binop) else: - return self._apply_binary_op(inputs[0], ops.reverse(binop)) + return self._apply_binary_op(inputs[0], 
binop, reverse=True) return NotImplemented From 236e088744e23b924c5b7a3c6b6bbd79a6ed7f42 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 22:13:40 +0000 Subject: [PATCH 4/6] fix typed null constant issue --- bigframes/core/compile/scalar_op_compiler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 1943eee96e..e2a9adf4c4 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -64,7 +64,9 @@ def _( expression: ex.ScalarConstantExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - return bigframes.dtypes.literal_to_ibis_scalar(expression.value, expression.dtype) + return bigframes.dtypes.literal_to_ibis_scalar( + expression.value, expression.dtype + ) @compile_expression.register def _( From 070ffb70de33fa7e3a703066e72c47e82bdf37d1 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 22:27:01 +0000 Subject: [PATCH 5/6] fix merge bug --- bigframes/core/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 76363c5c69..d4b048420c 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -671,7 +671,7 @@ def project_expr( """ # TODO(tbergeron): handle labels safely so callers don't need to result_id = guid.generate_guid() - array_val = self._expr.project(expr, result_id) + array_val = self._expr.project_to_id(expr, result_id) block = Block( array_val, index_columns=self.index_columns, From 5b53312b51f50ebbf8fa231cb2265a30e79e6a12 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 13 Jan 2024 00:11:35 +0000 Subject: [PATCH 6/6] fix prefix/suffix ops --- bigframes/core/blocks.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git
a/bigframes/core/blocks.py b/bigframes/core/blocks.py index d4b048420c..8c59f8106b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1227,7 +1227,10 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: expr = self._expr for index_col in self._index_columns: expr = expr.project_to_id( - expression=ops.AsTypeOp(to_type="string").as_expr(index_col), + expression=ops.add_op.as_expr( + ex.const(prefix), + ops.AsTypeOp(to_type="string").as_expr(index_col), + ), output_id=index_col, ) return Block( @@ -1247,7 +1250,10 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: expr = self._expr for index_col in self._index_columns: expr = expr.project_to_id( - expression=ops.AsTypeOp(to_type="string").as_expr(index_col), + expression=ops.add_op.as_expr( + ops.AsTypeOp(to_type="string").as_expr(index_col), + ex.const(suffix), + ), output_id=index_col, ) return Block(