def project(self, expression: expressions.Expression, output_id: str):
    """Create a new ArrayValue with one expression projected into a column.

    Args:
        expression: The scalar expression to evaluate.
        output_id: Id of the column the expression result is assigned to.

    Returns:
        An ArrayValue wrapping a ProjectionNode whose child is this value's
        node and whose assignments hold the single (expression, output_id)
        pair.
    """
    assignment = (expression, output_id)
    projection = nodes.ProjectionNode(child=self.node, assignments=(assignment,))
    return ArrayValue(projection)
+1262,14 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project_unary_op(index_col, ops.AsTypeOp(to_type="string")) + expr = expr.project( + expression=ops.AsTypeOp(to_type="string").as_expr(index_col), + output_id=index_col, + ) prefix_op = ops.ApplyRight(base_op=ops.add_op, right_scalar=suffix) - expr = expr.project_unary_op(index_col, prefix_op) + expr = expr.project( + expression=prefix_op.as_expr(index_col), output_id=index_col + ) return Block( expr, index_columns=self.index_columns, @@ -1568,10 +1576,10 @@ def merge( coalesced_ids = [] for left_id, right_id in zip(left_join_ids, right_join_ids): coalesced_id = guid.generate_guid() - joined_expr = joined_expr.project_binary_op( - get_column_left[left_id], - get_column_right[right_id], - ops.coalesce_op, + joined_expr = joined_expr.project( + ops.coalesce_op.as_expr( + get_column_left[left_id], get_column_right[right_id] + ), coalesced_id, ) coalesced_ids.append(coalesced_id) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index c1e8f1ea48..6a3db3f2bd 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -28,6 +28,7 @@ import bigframes.constants as constants import bigframes.core.compile.scalar_op_compiler as op_compilers +import bigframes.core.expression as expressions import bigframes.core.guid from bigframes.core.ordering import ( encode_order_string, @@ -151,18 +152,19 @@ def _reproject_to_table(self: T) -> T: """ ... 
def project_expression(
    self: "T",
    expression: "expressions.Expression",
    output_column_id: typing.Optional[str] = None,
) -> "T":
    """Apply an expression to the ArrayValue and assign the output to a column.

    Args:
        expression: The expression to compile. Each of its unbound variables
            is bound to the column of the same id via ``_get_ibis_column``.
        output_column_id: Id of the column receiving the result. When omitted
            (or empty), the first unbound variable of ``expression`` is
            overwritten instead.

    Returns:
        The result of ``_set_or_replace_by_id`` for the compiled value.

    Raises:
        ValueError: If no output id is given and the expression has no
            unbound variable to fall back on (e.g. a constant expression).
    """
    variables = expression.unbound_variables
    if output_column_id:
        result_id = output_column_id
    elif variables:
        # overwrite input if not output id provided
        result_id = variables[0]
    else:
        # Previously surfaced as an opaque IndexError from `unbound_variables[0]`.
        raise ValueError(
            "output_column_id is required when the expression has no unbound "
            "variables"
        )
    bindings = {col: self._get_ibis_column(col) for col in variables}
    value = op_compiler.compile_expression(expression, bindings).name(result_id)
    return self._set_or_replace_by_id(result_id, value)
@functools.singledispatchmethod
def compile_expression(
    self,
    expression: expressions.Expression,
    bindings: typing.Dict[str, ibis_types.Value],
) -> ibis_types.Value:
    """Compile an expression to an ibis value, dispatching on its type.

    This is the fallback for expression types with no registered handler.

    Args:
        expression: The expression to compile.
        bindings: Maps unbound variable ids to the ibis values they stand for.

    Raises:
        NotImplementedError: Always, for unregistered expression types.
    """
    raise NotImplementedError(f"Unrecognized expression: {expression}")


@compile_expression.register
def _(
    self,
    expression: expressions.ScalarConstantExpression,
    bindings: typing.Dict[str, ibis_types.Value],
) -> ibis_types.Value:
    # A constant ignores the bindings and compiles straight to a literal.
    constant = expression.value
    return ibis.literal(constant)


@compile_expression.register
def _(
    self,
    expression: expressions.UnboundVariableExpression,
    bindings: typing.Dict[str, ibis_types.Value],
) -> ibis_types.Value:
    # A variable compiles to whatever value the caller bound to its id.
    variable_id = expression.id
    if variable_id in bindings:
        return bindings[variable_id]
    raise ValueError(f"Could not resolve unbound variable {expression.id}")


@compile_expression.register
def _(
    self,
    expression: expressions.OpExpression,
    bindings: typing.Dict[str, ibis_types.Value],
) -> ibis_types.Value:
    # Compile every argument sub-expression first, in order, then apply the op.
    compiled_inputs = []
    for argument in expression.inputs:
        compiled_inputs.append(self.compile_expression(argument, bindings))
    return self.compile_row_op(expression.op, compiled_inputs)
@dataclasses.dataclass(frozen=True)
class Expression(abc.ABC):
    """An expression represents a computation taking N scalar inputs and producing a single output scalar."""

    @property
    def unbound_variables(self) -> typing.Tuple[str, ...]:
        """Ids of the variables this expression references; empty by default."""
        return ()


@dataclasses.dataclass(frozen=True)
class ScalarConstantExpression(Expression):
    """An expression representing a scalar constant."""

    # TODO: Further constrain?
    value: typing.Hashable


@dataclasses.dataclass(frozen=True)
class UnboundVariableExpression(Expression):
    """A variable expression representing an unbound variable."""

    # Identifier that must be bound to a value before evaluation.
    id: str

    @property
    def unbound_variables(self) -> typing.Tuple[str, ...]:
        """A one-element tuple holding this variable's id."""
        return (self.id,)


@dataclasses.dataclass(frozen=True)
class OpExpression(Expression):
    """An expression representing a scalar operation applied to 1 or more argument sub-expressions."""

    # The operation to apply; its `arguments` count must match len(inputs).
    op: "bigframes.operations.RowOp"
    # Argument sub-expressions, in the order the op consumes them.
    inputs: typing.Tuple[Expression, ...]

    def __post_init__(self):
        """Validate that the op's declared arity matches the number of inputs.

        Raises:
            ValueError: If ``op.arguments`` differs from ``len(inputs)``.
        """
        # Raised explicitly (not asserted) so the check survives `python -O`.
        expected = self.op.arguments
        actual = len(self.inputs)
        if expected != actual:
            raise ValueError(
                f"Operation expects {expected} argument(s) but {actual} "
                "input expression(s) were provided"
            )

    @property
    def unbound_variables(self) -> typing.Tuple[str, ...]:
        """Unbound variable ids of all inputs, concatenated in input order.

        Ids are not de-duplicated: a variable used by several inputs appears
        once per use.
        """
        return tuple(
            itertools.chain.from_iterable(
                sub_expr.unbound_variables for sub_expr in self.inputs
            )
        )
# One-argument op class (presumably UnaryOp; its header is outside the visible hunk).
def as_expr(self, input_id: str) -> bigframes.core.expression.Expression:
    """Wrap this op in an OpExpression over one unbound variable."""
    # Imported locally; the module-level import is TYPE_CHECKING-only to
    # avoid a circular dependency.
    import bigframes.core.expression as ex

    argument = ex.UnboundVariableExpression(input_id)
    return ex.OpExpression(self, (argument,))


# BinaryOp
def as_expr(
    self, left_input: str, right_input: str
) -> bigframes.core.expression.Expression:
    """Wrap this op in an OpExpression over two unbound variables (left, right)."""
    import bigframes.core.expression as ex

    arguments = tuple(
        ex.UnboundVariableExpression(column) for column in (left_input, right_input)
    )
    return ex.OpExpression(self, arguments)


# TernaryOp
def as_expr(
    self, input1: str, input2: str, input3: str
) -> bigframes.core.expression.Expression:
    """Wrap this op in an OpExpression over three unbound variables, in order."""
    import bigframes.core.expression as ex

    arguments = tuple(
        ex.UnboundVariableExpression(column) for column in (input1, input2, input3)
    )
    return ex.OpExpression(self, arguments)
@@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_unary_op("col1", ops.AsTypeOp("string"))._compile_ordered() + expr = value.project( + ops.AsTypeOp("string").as_expr("col1"), output_id="col1" + )._compile_ordered() assert value._compile_ordered().columns[0].type().is_int64() assert expr.columns[0].type().is_string() @@ -152,9 +154,7 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_binary_op( - "col2", "col3", ops.add_op, "col4" - )._compile_ordered() + expr = value.project(ops.add_op.as_expr("col2", "col3"), "col4")._compile_ordered() assert expr.columns[3].type().is_float64() actual = expr._to_ibis_expr(ordering_mode="unordered") assert len(expr.columns) == 4 @@ -173,8 +173,8 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_ternary_op( - "col2", "col3", "col4", ops.where_op, "col5" + expr = value.project( + ops.where_op.as_expr("col2", "col3", "col4"), "col5" )._compile_ordered() assert expr.columns[4].type().is_float64() actual = expr._to_ibis_expr(ordering_mode="unordered")