124 changes: 91 additions & 33 deletions bigframes/core/__init__.py
@@ -22,7 +22,7 @@
import pandas

import bigframes.core.compile as compiling
import bigframes.core.expression as expressions
import bigframes.core.expression as ex
import bigframes.core.guid
import bigframes.core.nodes as nodes
from bigframes.core.ordering import OrderingColumnReference
@@ -114,12 +114,6 @@ def row_count(self) -> ArrayValue:
return ArrayValue(nodes.RowCountNode(child=self.node))

# Operations

def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
return ArrayValue(
nodes.DropColumnsNode(child=self.node, columns=tuple(columns))
)

def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue:
"""Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression."""
return ArrayValue(
@@ -140,21 +134,104 @@ def promote_offsets(self, col_id: str) -> ArrayValue:
"""
return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id))

def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
return ArrayValue(
nodes.SelectNode(child=self.node, column_ids=tuple(column_ids))
)

def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue:
"""Append together multiple ArrayValue objects."""
return ArrayValue(
nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]]))
)

def project(self, expression: expressions.Expression, output_id: str):
def project_to_id(self, expression: ex.Expression, output_id: str):
if output_id in self.column_ids: # Mutate case
exprs = [
((expression if (col_id == output_id) else ex.free_var(col_id)), col_id)
for col_id in self.column_ids
]
else: # append case
self_projection = (
(ex.free_var(col_id), col_id) for col_id in self.column_ids
)
exprs = [*self_projection, (expression, output_id)]
return ArrayValue(
nodes.ProjectionNode(
child=self.node,
assignments=tuple(exprs),
)
)

def assign(self, source_id: str, destination_id: str) -> ArrayValue:
if destination_id in self.column_ids: # Mutate case
exprs = [
(
(
ex.free_var(source_id)
if (col_id == destination_id)
else ex.free_var(col_id)
),
col_id,
)
for col_id in self.column_ids
]
else: # append case
self_projection = (
(ex.free_var(col_id), col_id) for col_id in self.column_ids
)
exprs = [*self_projection, (ex.free_var(source_id), destination_id)]
return ArrayValue(
nodes.ProjectionNode(
child=self.node,
assignments=tuple(exprs),
)
)

def assign_constant(
self,
destination_id: str,
value: typing.Any,
dtype: typing.Optional[bigframes.dtypes.Dtype],
) -> ArrayValue:
if destination_id in self.column_ids: # Mutate case
exprs = [
(
(
ex.const(value, dtype)
if (col_id == destination_id)
else ex.free_var(col_id)
),
col_id,
)
for col_id in self.column_ids
]
else: # append case
self_projection = (
(ex.free_var(col_id), col_id) for col_id in self.column_ids
)
exprs = [*self_projection, (ex.const(value, dtype), destination_id)]
return ArrayValue(
nodes.ProjectionNode(
child=self.node,
assignments=tuple(exprs),
)
)

def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
selections = ((ex.free_var(col_id), col_id) for col_id in column_ids)
return ArrayValue(
nodes.ProjectionNode(
child=self.node,
assignments=tuple(selections),
)
)

def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
new_projection = (
(ex.free_var(col_id), col_id)
for col_id in self.column_ids
if col_id not in columns
)
return ArrayValue(
nodes.ProjectionNode(
child=self.node, assignments=((expression, output_id),)
child=self.node,
assignments=tuple(new_projection),
)
)

@@ -277,25 +354,6 @@ def unpivot(
)
)

def assign(self, source_id: str, destination_id: str) -> ArrayValue:
return ArrayValue(
nodes.AssignNode(
child=self.node, source_id=source_id, destination_id=destination_id
)
)

def assign_constant(
self,
destination_id: str,
value: typing.Any,
dtype: typing.Optional[bigframes.dtypes.Dtype],
) -> ArrayValue:
return ArrayValue(
nodes.AssignConstantNode(
child=self.node, destination_id=destination_id, value=value, dtype=dtype
)
)

def join(
self,
self_column_ids: typing.Sequence[str],
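Taken together, the ArrayValue changes above fold drop_columns, select_columns, assign, and assign_constant into the same generic nodes.ProjectionNode construction that project_to_id uses: every kept column gets an identity expression (ex.free_var), and the target column either gets a rewritten expression or an appended one. As a hedged illustration only (the PR itself builds the assignment tuples inline rather than delegating), assign and assign_constant could equivalently be written in terms of project_to_id:

import typing

import bigframes.core.expression as ex
import bigframes.dtypes


# Illustrative method-shaped sketches, behaviorally equivalent to the inline
# versions in the diff above; not the code the PR actually adds.
def assign(self, source_id: str, destination_id: str) -> "ArrayValue":
    # Copy the values of source_id into destination_id (mutate or append).
    return self.project_to_id(ex.free_var(source_id), destination_id)


def assign_constant(
    self,
    destination_id: str,
    value: typing.Any,
    dtype: typing.Optional[bigframes.dtypes.Dtype],
) -> "ArrayValue":
    # Broadcast a scalar literal into destination_id (mutate or append).
    return self.project_to_id(ex.const(value, dtype), destination_id)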
24 changes: 12 additions & 12 deletions bigframes/core/blocks.py
@@ -671,7 +671,7 @@ def project_expr(
"""
# TODO(tbergeron): handle labels safely so callers don't need to
result_id = guid.generate_guid()
array_val = self._expr.project(expr, result_id)
array_val = self._expr.project_to_id(expr, result_id)
block = Block(
array_val,
index_columns=self.index_columns,
@@ -1226,11 +1226,11 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block:
if axis_number == 0:
expr = self._expr
for index_col in self._index_columns:
add_prefix = ops.add_op.as_expr(
ex.const(prefix), ops.AsTypeOp(to_type="string").as_expr(index_col)
)
expr = expr.project(
expression=add_prefix,
expr = expr.project_to_id(
expression=ops.add_op.as_expr(
ex.const(prefix),
ops.AsTypeOp(to_type="string").as_expr(index_col),
),
output_id=index_col,
)
return Block(
@@ -1249,11 +1249,11 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block:
if axis_number == 0:
expr = self._expr
for index_col in self._index_columns:
add_suffix = ops.add_op.as_expr(
ops.AsTypeOp(to_type="string").as_expr(index_col), ex.const(suffix)
)
expr = expr.project(
expression=add_suffix,
expr = expr.project_to_id(
expression=ops.add_op.as_expr(
ops.AsTypeOp(to_type="string").as_expr(index_col),
ex.const(suffix),
),
output_id=index_col,
)
return Block(
@@ -1557,7 +1557,7 @@ def merge(
coalesced_ids = []
for left_id, right_id in zip(left_join_ids, right_join_ids):
coalesced_id = guid.generate_guid()
joined_expr = joined_expr.project(
joined_expr = joined_expr.project_to_id(
ops.coalesce_op.as_expr(
get_column_left[left_id], get_column_right[right_id]
),
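At the blocks.py call sites, the old two-step pattern (build an expression, then project it) collapses into a single project_to_id call that takes a scalar-op expression built from ops and ex. A minimal usage sketch of the add_suffix shape above, written as a standalone helper; the helper name and the ops import alias are assumptions for illustration:

import bigframes.core.expression as ex
import bigframes.operations as ops  # assumed alias, matching how blocks.py refers to ops


def add_string_suffix(expr, col_id: str, suffix: str):
    # Overwrite col_id with CAST(col_id AS STRING) || suffix in one projection,
    # mirroring the add_suffix loop body in the diff above. `expr` is an ArrayValue.
    return expr.project_to_id(
        expression=ops.add_op.as_expr(
            ops.AsTypeOp(to_type="string").as_expr(col_id),
            ex.const(suffix),
        ),
        output_id=col_id,
    )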
107 changes: 44 additions & 63 deletions bigframes/core/compile/compiled.py
@@ -26,9 +26,8 @@
import ibis.expr.types as ibis_types
import pandas

import bigframes.constants as constants
import bigframes.core.compile.scalar_op_compiler as op_compilers
import bigframes.core.expression as expressions
import bigframes.core.expression as ex
import bigframes.core.guid
from bigframes.core.ordering import (
encode_order_string,
@@ -96,16 +95,6 @@ def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]:
else None
)

@abc.abstractmethod
def select_columns(self: T, column_ids: typing.Sequence[str]) -> T:
"""Creates a new expression based on this expression with new columns."""
...

def drop_columns(self: T, columns: Iterable[str]) -> T:
return self.select_columns(
[col for col in self.column_ids if col not in columns]
)

@abc.abstractmethod
def filter(self: T, predicate_id: str, keep_null: bool = False) -> T:
"""Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression."""
@@ -152,40 +141,26 @@ def _reproject_to_table(self: T) -> T:
"""
...

def project_expression(
def projection(
self: T,
expression: expressions.Expression,
output_column_id: typing.Optional[str] = None,
expression_id_pairs: typing.Tuple[typing.Tuple[ex.Expression, str], ...],
) -> T:
"""Apply an expression to the ArrayValue and assign the output to a column."""
result_id = (
output_column_id or expression.unbound_variables[0]
) # overwrite input if not output id provided
bindings = {
col: self._get_ibis_column(col) for col in expression.unbound_variables
}
value = op_compiler.compile_expression(expression, bindings).name(result_id)
return self._set_or_replace_by_id(result_id, value)
bindings = {col: self._get_ibis_column(col) for col in self.column_ids}
values = [
op_compiler.compile_expression(expression, bindings).name(id)
for expression, id in expression_id_pairs
]
result = self._select(tuple(values)) # type: ignore

def assign(self: T, source_id: str, destination_id: str) -> T:
return self._set_or_replace_by_id(
destination_id, self._get_ibis_column(source_id)
)
# Need to reproject to convert ibis Scalar to ibis Column object
if any(exp_id[0].is_const for exp_id in expression_id_pairs):
result = result._reproject_to_table()
return result

def assign_constant(
self: T,
destination_id: str,
value: typing.Any,
dtype: typing.Optional[bigframes.dtypes.Dtype],
) -> T:
# TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis.
ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype)
if ibis_value is None:
raise NotImplementedError(
f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}"
)
expr = self._set_or_replace_by_id(destination_id, ibis_value)
return expr._reproject_to_table()
@abc.abstractmethod
def _select(self: T, values: typing.Tuple[ibis_types.Value]) -> T:
...

@abc.abstractmethod
def _set_or_replace_by_id(self: T, id: str, new_value: ibis_types.Value) -> T:
@@ -330,14 +305,6 @@ def _to_ibis_expr(
table = table.filter(ibis.random() < ibis.literal(fraction))
return table

def select_columns(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
"""Creates a new expression based on this expression with new columns."""
columns = [self._get_ibis_column(col_id) for col_id in column_ids]
builder = self.builder()
builder.columns = list(columns)
new_expr = builder.build()
return new_expr

def filter(self, predicate_id: str, keep_null: bool = False) -> UnorderedIR:
condition = typing.cast(
ibis_types.BooleanValue, self._get_ibis_column(predicate_id)
@@ -577,6 +544,11 @@ def _set_or_replace_by_id(
builder.columns = [*self.columns, new_value.name(id)]
return builder.build()

def _select(self, values: typing.Tuple[ibis_types.Value]) -> UnorderedIR:
builder = self.builder()
builder.columns = values
return builder.build()

def _reproject_to_table(self) -> UnorderedIR:
"""
Internal operators that projects the internal representation into a
@@ -816,20 +788,6 @@ def promote_offsets(self, col_id: str) -> OrderedIR:
]
return expr_builder.build()

def select_columns(self, column_ids: typing.Sequence[str]) -> OrderedIR:
"""Creates a new expression based on this expression with new columns."""
columns = [self._get_ibis_column(col_id) for col_id in column_ids]
expr = self
for ordering_column in set(self.column_ids).intersection(
[col_ref.column_id for col_ref in self._ordering.ordering_value_columns]
):
# Need to hide ordering columns that are being dropped. Alternatively, could project offsets
expr = expr._hide_column(ordering_column)
builder = expr.builder()
builder.columns = list(columns)
new_expr = builder.build()
return new_expr

## Methods that only work with ordering
def project_window_op(
self,
@@ -1221,6 +1179,29 @@ def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> Ordered
builder.columns = [*self.columns, new_value.name(id)]
return builder.build()

def _select(self, values: typing.Tuple[ibis_types.Value]) -> OrderedIR:
"""Safely assign by id while maintaining ordering integrity."""
# TODO: Split into explicit set and replace methods
ordering_col_ids = [
col_ref.column_id for col_ref in self._ordering.ordering_value_columns
]
ir = self
mappings = {value.name: value for value in values}
for ordering_id in ordering_col_ids:
# Drop case
if (ordering_id not in mappings) and (ordering_id in ir.column_ids):
# id is being dropped, hide it first
ir = ir._hide_column(ordering_id)
# Mutate case
elif (ordering_id in mappings) and not mappings[ordering_id].equals(
ir._get_any_column(ordering_id)
):
ir = ir._hide_column(ordering_id)

builder = ir.builder()
builder.columns = list(values)
return builder.build()

## Ordering specific helpers
def _get_any_column(self, key: str) -> ibis_types.Value:
"""Gets the Ibis expression for a given column. Will also get hidden columns."""
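The OrderedIR._select override is the subtle part of this refactor: before re-selecting, it must hide any ordering column that the new selection either drops or rebinds to a different expression, so the ordering keys remain addressable afterwards. A minimal sketch of just that decision rule, pulled out as a pure function over column ids; the function and its parameters are illustrative, not part of the PR:

import typing

import ibis.expr.types as ibis_types


def ordering_columns_to_hide(
    ordering_col_ids: typing.Sequence[str],
    visible_col_ids: typing.Sequence[str],
    new_values: typing.Mapping[str, ibis_types.Value],
    current_value: typing.Callable[[str], ibis_types.Value],
) -> typing.List[str]:
    # An ordering column needs a hidden copy when the selection drops it
    # entirely, or keeps its id but binds it to a different ibis expression
    # than the one the ordering currently references.
    to_hide = []
    for col_id in ordering_col_ids:
        dropped = col_id not in new_values and col_id in visible_col_ids
        mutated = col_id in new_values and not new_values[col_id].equals(
            current_value(col_id)
        )
        if dropped or mutated:
            to_hide.append(col_id)
    return to_hide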