diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index e32977fbce..8c08698b93 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -22,7 +22,7 @@ import pandas import bigframes.core.compile as compiling -import bigframes.core.expression as expressions +import bigframes.core.expression as ex import bigframes.core.guid import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference @@ -114,12 +114,6 @@ def row_count(self) -> ArrayValue: return ArrayValue(nodes.RowCountNode(child=self.node)) # Operations - - def drop_columns(self, columns: Iterable[str]) -> ArrayValue: - return ArrayValue( - nodes.DropColumnsNode(child=self.node, columns=tuple(columns)) - ) - def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" return ArrayValue( @@ -140,21 +134,104 @@ def promote_offsets(self, col_id: str) -> ArrayValue: """ return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)) - def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: - return ArrayValue( - nodes.SelectNode(child=self.node, column_ids=tuple(column_ids)) - ) - def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: """Append together multiple ArrayValue objects.""" return ArrayValue( nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]])) ) - def project(self, expression: expressions.Expression, output_id: str): + def project_to_id(self, expression: ex.Expression, output_id: str): + if output_id in self.column_ids: # Mutate case + exprs = [ + ((expression if (col_id == output_id) else ex.free_var(col_id)), col_id) + for col_id in self.column_ids + ] + else: # append case + self_projection = ( + (ex.free_var(col_id), col_id) for col_id in self.column_ids + ) + exprs = [*self_projection, (expression, output_id)] + return ArrayValue( + nodes.ProjectionNode( + child=self.node, + assignments=tuple(exprs), + ) + ) + + def assign(self, source_id: str, destination_id: str) -> ArrayValue: + if destination_id in self.column_ids: # Mutate case + exprs = [ + ( + ( + ex.free_var(source_id) + if (col_id == destination_id) + else ex.free_var(col_id) + ), + col_id, + ) + for col_id in self.column_ids + ] + else: # append case + self_projection = ( + (ex.free_var(col_id), col_id) for col_id in self.column_ids + ) + exprs = [*self_projection, (ex.free_var(source_id), destination_id)] + return ArrayValue( + nodes.ProjectionNode( + child=self.node, + assignments=tuple(exprs), + ) + ) + + def assign_constant( + self, + destination_id: str, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> ArrayValue: + if destination_id in self.column_ids: # Mutate case + exprs = [ + ( + ( + ex.const(value, dtype) + if (col_id == destination_id) + else ex.free_var(col_id) + ), + col_id, + ) + for col_id in self.column_ids + ] + else: # append case + self_projection = ( + (ex.free_var(col_id), col_id) for col_id in self.column_ids + ) + exprs = [*self_projection, (ex.const(value, dtype), destination_id)] + return ArrayValue( + nodes.ProjectionNode( + child=self.node, + assignments=tuple(exprs), + ) + ) + + def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: + selections = ((ex.free_var(col_id), col_id) for col_id in column_ids) + return ArrayValue( + nodes.ProjectionNode( + child=self.node, + assignments=tuple(selections), + ) + ) + + def drop_columns(self, columns: Iterable[str]) -> ArrayValue: + new_projection = ( + (ex.free_var(col_id), col_id) + for col_id in self.column_ids + if col_id not in columns + ) return ArrayValue( nodes.ProjectionNode( - child=self.node, assignments=((expression, output_id),) + child=self.node, + assignments=tuple(new_projection), ) ) @@ -277,25 +354,6 @@ def unpivot( ) ) - def assign(self, source_id: str, destination_id: str) -> ArrayValue: - return ArrayValue( - nodes.AssignNode( - child=self.node, source_id=source_id, destination_id=destination_id - ) - ) - - def assign_constant( - self, - destination_id: str, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> ArrayValue: - return ArrayValue( - nodes.AssignConstantNode( - child=self.node, destination_id=destination_id, value=value, dtype=dtype - ) - ) - def join( self, self_column_ids: typing.Sequence[str], diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 1960def0d5..8c59f8106b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -671,7 +671,7 @@ def project_expr( """ # TODO(tbergeron): handle labels safely so callers don't need to result_id = guid.generate_guid() - array_val = self._expr.project(expr, result_id) + array_val = self._expr.project_to_id(expr, result_id) block = Block( array_val, index_columns=self.index_columns, @@ -1226,11 +1226,11 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - add_prefix = ops.add_op.as_expr( - ex.const(prefix), ops.AsTypeOp(to_type="string").as_expr(index_col) - ) - expr = expr.project( - expression=add_prefix, + expr = expr.project_to_id( + expression=ops.add_op.as_expr( + ex.const(prefix), + ops.AsTypeOp(to_type="string").as_expr(index_col), + ), output_id=index_col, ) return Block( @@ -1249,11 +1249,11 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - add_suffix = ops.add_op.as_expr( - ops.AsTypeOp(to_type="string").as_expr(index_col), ex.const(suffix) - ) - expr = expr.project( - expression=add_suffix, + expr = expr.project_to_id( + expression=ops.add_op.as_expr( + ops.AsTypeOp(to_type="string").as_expr(index_col), + ex.const(suffix), + ), output_id=index_col, ) return Block( @@ -1557,7 +1557,7 @@ def merge( coalesced_ids = [] for left_id, right_id in zip(left_join_ids, right_join_ids): coalesced_id = guid.generate_guid() - joined_expr = joined_expr.project( + joined_expr = joined_expr.project_to_id( ops.coalesce_op.as_expr( get_column_left[left_id], get_column_right[right_id] ), diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 6a3db3f2bd..2cab6fb95d 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -26,9 +26,8 @@ import ibis.expr.types as ibis_types import pandas -import bigframes.constants as constants import bigframes.core.compile.scalar_op_compiler as op_compilers -import bigframes.core.expression as expressions +import bigframes.core.expression as ex import bigframes.core.guid from bigframes.core.ordering import ( encode_order_string, @@ -96,16 +95,6 @@ def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: else None ) - @abc.abstractmethod - def select_columns(self: T, column_ids: typing.Sequence[str]) -> T: - """Creates a new expression based on this expression with new columns.""" - ... - - def drop_columns(self: T, columns: Iterable[str]) -> T: - return self.select_columns( - [col for col in self.column_ids if col not in columns] - ) - @abc.abstractmethod def filter(self: T, predicate_id: str, keep_null: bool = False) -> T: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" @@ -152,40 +141,26 @@ def _reproject_to_table(self: T) -> T: """ ... - def project_expression( + def projection( self: T, - expression: expressions.Expression, - output_column_id: typing.Optional[str] = None, + expression_id_pairs: typing.Tuple[typing.Tuple[ex.Expression, str], ...], ) -> T: """Apply an expression to the ArrayValue and assign the output to a column.""" - result_id = ( - output_column_id or expression.unbound_variables[0] - ) # overwrite input if not output id provided - bindings = { - col: self._get_ibis_column(col) for col in expression.unbound_variables - } - value = op_compiler.compile_expression(expression, bindings).name(result_id) - return self._set_or_replace_by_id(result_id, value) + bindings = {col: self._get_ibis_column(col) for col in self.column_ids} + values = [ + op_compiler.compile_expression(expression, bindings).name(id) + for expression, id in expression_id_pairs + ] + result = self._select(tuple(values)) # type: ignore - def assign(self: T, source_id: str, destination_id: str) -> T: - return self._set_or_replace_by_id( - destination_id, self._get_ibis_column(source_id) - ) + # Need to reproject to convert ibis Scalar to ibis Column object + if any(exp_id[0].is_const for exp_id in expression_id_pairs): + result = result._reproject_to_table() + return result - def assign_constant( - self: T, - destination_id: str, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> T: - # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. - ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) - if ibis_value is None: - raise NotImplementedError( - f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" - ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() + @abc.abstractmethod + def _select(self: T, values: typing.Tuple[ibis_types.Value]) -> T: + ... @abc.abstractmethod def _set_or_replace_by_id(self: T, id: str, new_value: ibis_types.Value) -> T: @@ -330,14 +305,6 @@ def _to_ibis_expr( table = table.filter(ibis.random() < ibis.literal(fraction)) return table - def select_columns(self, column_ids: typing.Sequence[str]) -> UnorderedIR: - """Creates a new expression based on this expression with new columns.""" - columns = [self._get_ibis_column(col_id) for col_id in column_ids] - builder = self.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr - def filter(self, predicate_id: str, keep_null: bool = False) -> UnorderedIR: condition = typing.cast( ibis_types.BooleanValue, self._get_ibis_column(predicate_id) @@ -577,6 +544,11 @@ def _set_or_replace_by_id( builder.columns = [*self.columns, new_value.name(id)] return builder.build() + def _select(self, values: typing.Tuple[ibis_types.Value]) -> UnorderedIR: + builder = self.builder() + builder.columns = values + return builder.build() + def _reproject_to_table(self) -> UnorderedIR: """ Internal operators that projects the internal representation into a @@ -816,20 +788,6 @@ def promote_offsets(self, col_id: str) -> OrderedIR: ] return expr_builder.build() - def select_columns(self, column_ids: typing.Sequence[str]) -> OrderedIR: - """Creates a new expression based on this expression with new columns.""" - columns = [self._get_ibis_column(col_id) for col_id in column_ids] - expr = self - for ordering_column in set(self.column_ids).intersection( - [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] - ): - # Need to hide ordering columns that are being dropped. Alternatively, could project offsets - expr = expr._hide_column(ordering_column) - builder = expr.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr - ## Methods that only work with ordering def project_window_op( self, @@ -1221,6 +1179,29 @@ def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> Ordered builder.columns = [*self.columns, new_value.name(id)] return builder.build() + def _select(self, values: typing.Tuple[ibis_types.Value]) -> OrderedIR: + """Safely assign by id while maintaining ordering integrity.""" + # TODO: Split into explicit set and replace methods + ordering_col_ids = [ + col_ref.column_id for col_ref in self._ordering.ordering_value_columns + ] + ir = self + mappings = {value.name: value for value in values} + for ordering_id in ordering_col_ids: + # Drop case + if (ordering_id not in mappings) and (ordering_id in ir.column_ids): + # id is being dropped, hide it first + ir = ir._hide_column(ordering_id) + # Mutate case + elif (ordering_id in mappings) and not mappings[ordering_id].equals( + ir._get_any_column(ordering_id) + ): + ir = ir._hide_column(ordering_id) + + builder = ir.builder() + builder.columns = list(values) + return builder.build() + ## Ordering specific helpers def _get_any_column(self, key: str) -> ibis_types.Value: """Gets the Ibis expression for a given column. Will also get hidden columns.""" diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index eacee26bcb..18fcd73d19 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -79,16 +79,6 @@ def compile_join(node: nodes.JoinNode, ordered: bool = True): ) -@_compile_node.register -def compile_select(node: nodes.SelectNode, ordered: bool = True): - return compile_node(node.child, ordered).select_columns(node.column_ids) - - -@_compile_node.register -def compile_drop(node: nodes.DropColumnsNode, ordered: bool = True): - return compile_node(node.child, ordered).drop_columns(node.columns) - - @_compile_node.register def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) @@ -145,9 +135,7 @@ def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): @_compile_node.register def compile_projection(node: nodes.ProjectionNode, ordered: bool = True): result = compile_node(node.child, ordered) - for expr, id in node.assignments: - result = result.project_expression(expr, id) - return result + return result.projection(node.assignments) @_compile_node.register @@ -210,18 +198,6 @@ def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True): ) -@_compile_node.register -def compile_assign(node: nodes.AssignNode, ordered: bool = True): - return compile_node(node.child, ordered).assign(node.source_id, node.destination_id) - - -@_compile_node.register -def compile_assign_constant(node: nodes.AssignConstantNode, ordered: bool = True): - return compile_node(node.child, ordered).assign_constant( - node.destination_id, node.value, node.dtype - ) - - @_compile_node.register def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): return compile_node(node.child, ordered)._uniform_sampling(node.fraction) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 115242e722..bf0755acc7 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -26,7 +26,7 @@ import pandas as pd import bigframes.constants as constants -import bigframes.core.expression as expressions +import bigframes.core.expression as ex import bigframes.dtypes import bigframes.operations as ops @@ -53,7 +53,7 @@ class ScalarOpCompiler: @functools.singledispatchmethod def compile_expression( self, - expression: expressions.Expression, + expression: ex.Expression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: raise NotImplementedError(f"Unrecognized expression: {expression}") @@ -61,17 +61,17 @@ def compile_expression( @compile_expression.register def _( self, - expression: expressions.ScalarConstantExpression, + expression: ex.ScalarConstantExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - if pd.isnull(expression.value): # type: ignore - return ibis.null() - return ibis.literal(expression.value) + return bigframes.dtypes.literal_to_ibis_scalar( + expression.value, expression.dtype + ) @compile_expression.register def _( self, - expression: expressions.UnboundVariableExpression, + expression: ex.UnboundVariableExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: if expression.id not in bindings: @@ -82,7 +82,7 @@ def _( @compile_expression.register def _( self, - expression: expressions.OpExpression, + expression: ex.OpExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: inputs = [ diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 59c3e595d9..540f9b6e5a 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -18,12 +18,16 @@ import dataclasses import itertools import typing +from typing import Optional +import bigframes.dtypes import bigframes.operations -def const(value: typing.Hashable) -> Expression: - return ScalarConstantExpression(value) +def const( + value: typing.Hashable, dtype: Optional[bigframes.dtypes.Dtype] = None +) -> Expression: + return ScalarConstantExpression(value, dtype) def free_var(id: str) -> Expression: @@ -41,6 +45,10 @@ def unbound_variables(self) -> typing.Tuple[str, ...]: def rename(self, name_mapping: dict[str, str]) -> Expression: return self + @abc.abstractproperty + def is_const(self) -> bool: + return False + @dataclasses.dataclass(frozen=True) class ScalarConstantExpression(Expression): @@ -48,6 +56,11 @@ class ScalarConstantExpression(Expression): # TODO: Further constrain? value: typing.Hashable + dtype: Optional[bigframes.dtypes.Dtype] = None + + @property + def is_const(self) -> bool: + return True @dataclasses.dataclass(frozen=True) @@ -66,6 +79,10 @@ def rename(self, name_mapping: dict[str, str]) -> Expression: else: return self + @property + def is_const(self) -> bool: + return False + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): @@ -89,3 +106,7 @@ def rename(self, name_mapping: dict[str, str]) -> Expression: return OpExpression( self.op, tuple(input.rename(name_mapping) for input in self.inputs) ) + + @property + def is_const(self) -> bool: + return all(child.is_const for child in self.inputs) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index c02fdbb12d..4ec11cb163 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -609,7 +609,7 @@ def coalesce_columns( expr = expr.drop_columns([left_id]) elif how == "outer": coalesced_id = bigframes.core.guid.generate_guid() - expr = expr.project( + expr = expr.project_to_id( ops.coalesce_op.as_expr(left_id, right_id), coalesced_id ) expr = expr.drop_columns([left_id, right_id]) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 360f853e3e..d30db9a7f7 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -21,7 +21,7 @@ import pandas -import bigframes.core.expression as expressions +import bigframes.core.expression as ex import bigframes.core.guid from bigframes.core.ordering import OrderingColumnReference import bigframes.core.window_spec as window @@ -145,14 +145,6 @@ def __hash__(self): # Unary nodes -@dataclass(frozen=True) -class DropColumnsNode(UnaryNode): - columns: Tuple[str, ...] - - def __hash__(self): - return self._node_hash - - @dataclass(frozen=True) class PromoteOffsetsNode(UnaryNode): col_id: str @@ -187,17 +179,9 @@ def __hash__(self): return self._node_hash -@dataclass(frozen=True) -class SelectNode(UnaryNode): - column_ids: typing.Tuple[str, ...] - - def __hash__(self): - return self._node_hash - - @dataclass(frozen=True) class ProjectionNode(UnaryNode): - assignments: typing.Tuple[typing.Tuple[expressions.Expression, str], ...] + assignments: typing.Tuple[typing.Tuple[ex.Expression, str], ...] def __hash__(self): return self._node_hash @@ -264,25 +248,6 @@ def __hash__(self): return self._node_hash -@dataclass(frozen=True) -class AssignNode(UnaryNode): - source_id: str - destination_id: str - - def __hash__(self): - return self._node_hash - - -@dataclass(frozen=True) -class AssignConstantNode(UnaryNode): - destination_id: str - value: typing.Hashable - dtype: typing.Optional[bigframes.dtypes.Dtype] - - def __hash__(self): - return self._node_hash - - @dataclass(frozen=True) class RandomSampleNode(UnaryNode): fraction: float diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 8032093b62..9f415f3bc4 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -136,7 +136,7 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project( + expr = value.project_to_id( ops.AsTypeOp("string").as_expr("col1"), output_id="col1" )._compile_ordered() assert value._compile_ordered().columns[0].type().is_int64() @@ -154,7 +154,9 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project(ops.add_op.as_expr("col2", "col3"), "col4")._compile_ordered() + expr = value.project_to_id( + ops.add_op.as_expr("col2", "col3"), "col4" + )._compile_ordered() assert expr.columns[3].type().is_float64() actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 4 @@ -173,7 +175,7 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project( + expr = value.project_to_id( ops.where_op.as_expr("col2", "col3", "col4"), "col5" )._compile_ordered() assert expr.columns[4].type().is_float64()