From e0a3f14323d3027e12ead403ae8aa1b9975d3fd9 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 11 Jan 2024 22:43:58 +0000 Subject: [PATCH 1/5] refactor: remove 'partial' ops and replace with expressions --- bigframes/core/block_transforms.py | 155 ++++++++----------- bigframes/core/blocks.py | 125 +++++++-------- bigframes/core/compile/scalar_op_compiler.py | 33 ---- bigframes/core/expression.py | 22 +++ bigframes/core/groupby/__init__.py | 12 +- bigframes/core/indexers.py | 18 +-- bigframes/core/indexes/index.py | 25 +-- bigframes/core/reshape/__init__.py | 5 +- bigframes/dataframe.py | 85 +++++----- bigframes/operations/__init__.py | 94 ++++------- bigframes/operations/base.py | 25 +-- bigframes/pandas/__init__.py | 12 +- bigframes/series.py | 32 ++-- 13 files changed, 284 insertions(+), 359 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 0b6886562e..96eb7a49e3 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -20,6 +20,7 @@ import bigframes.constants as constants import bigframes.core as core import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.ordering as ordering import bigframes.core.window_spec as windows import bigframes.dtypes as dtypes @@ -44,11 +45,10 @@ def equals(block1: blocks.Block, block2: blocks.Block) -> bool: for lcol, rcol in zip(block1.value_columns, block2.value_columns): lcolmapped = lmap[lcol] rcolmapped = rmap[rcol] - joined_block, result_id = joined_block.apply_binary_op( - lcolmapped, rcolmapped, ops.eq_null_match_op - ) - joined_block, result_id = joined_block.apply_unary_op( - result_id, ops.partial_right(ops.fillna_op, False) + joined_block, result_id = joined_block.project_expr( + ops.fillna_op.as_expr( + ops.eq_null_match_op.as_expr(lcolmapped, rcolmapped), ex.const(False) + ) ) equality_ids.append(result_id) @@ -91,9 +91,8 @@ def indicate_duplicates( agg_ops.count_op, 
window_spec=window_spec, ) - block, duplicate_indicator = block.apply_unary_op( - val_count_col_id, - ops.partial_right(ops.gt_op, 1), + block, duplicate_indicator = block.project_expr( + ops.gt_op.as_expr(val_count_col_id, ex.const(1)) ) return ( block.drop_columns( @@ -183,8 +182,8 @@ def _interpolate_column( # Note, this method may block, notnull = block.apply_unary_op(column, ops.notnull_op) - block, masked_offsets = block.apply_binary_op( - x_values, notnull, ops.partial_arg3(ops.where_op, None) + block, masked_offsets = block.project_expr( + ops.where_op.as_expr(x_values, notnull, ex.const(None)) ) block, previous_value = block.apply_window_op( @@ -271,25 +270,22 @@ def _interpolate_points_nearest( xpredict_id: str, ) -> typing.Tuple[blocks.Block, str]: """Interpolate by taking the y value of the nearest x value""" - block, left_diff = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op) - block, right_diff = block.apply_binary_op(x1_id, xpredict_id, ops.sub_op) + left_diff = ops.sub_op.as_expr(xpredict_id, x0_id) + right_diff = ops.sub_op.as_expr(x1_id, xpredict_id) # If diffs equal, choose left - block, choose_left = block.apply_binary_op(left_diff, right_diff, ops.le_op) - block, choose_left = block.apply_unary_op( - choose_left, ops.partial_right(ops.fillna_op, False) + choose_left = ops.fillna_op.as_expr( + ops.le_op.as_expr(left_diff, right_diff), ex.const(False) ) - block, nearest = block.apply_ternary_op(y0_id, choose_left, y1_id, ops.where_op) - - block, y0_exists = block.apply_unary_op(y0_id, ops.notnull_op) - block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op) - block, is_interpolation = block.apply_binary_op(y0_exists, y1_exists, ops.and_op) + nearest = ops.where_op.as_expr(y0_id, choose_left, y1_id) - block, prediction_id = block.apply_binary_op( - nearest, is_interpolation, ops.partial_arg3(ops.where_op, None) + is_interpolation = ops.and_op.as_expr( + ops.notnull_op.as_expr(y0_id), ops.notnull_op.as_expr(y1_id) ) - return block, 
prediction_id + return block.project_expr( + ops.where_op.as_expr(nearest, is_interpolation, ex.const(None)) + ) def _interpolate_points_ffill( @@ -302,11 +298,9 @@ def _interpolate_points_ffill( ) -> typing.Tuple[blocks.Block, str]: """Interpolates by using the preceding values""" # check for existance of y1, otherwise we are extrapolating instead of interpolating - block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op) - block, prediction_id = block.apply_binary_op( - y0_id, y1_exists, ops.partial_arg3(ops.where_op, None) + return block.project_expr( + ops.where_op.as_expr(y0_id, ops.notnull_op.as_expr(y1_id), ex.const(None)) ) - return block, prediction_id def drop_duplicates( @@ -519,9 +513,7 @@ def nsmallest( agg_ops.rank_op, window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) - block, condition = block.apply_unary_op( - counter, ops.partial_right(ops.le_op, n) - ) + block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n))) block = block.filter(condition) return block.drop_columns([counter, condition]) @@ -551,9 +543,7 @@ def nlargest( agg_ops.rank_op, window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) - block, condition = block.apply_unary_op( - counter, ops.partial_right(ops.le_op, n) - ) + block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n))) block = block.filter(condition) return block.drop_columns([counter, condition]) @@ -641,7 +631,7 @@ def kurt( def _mean_delta_to_power( block: blocks.Block, - n_power, + n_power: int, column_ids: typing.Sequence[str], grouping_column_ids: typing.Sequence[str], ) -> typing.Tuple[blocks.Block, typing.Sequence[str]]: @@ -649,11 +639,10 @@ def _mean_delta_to_power( window = windows.WindowSpec(grouping_keys=tuple(grouping_column_ids)) block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window) delta_ids = [] - cube_op = ops.partial_right(ops.pow_op, n_power) for val_id, mean_val_id in zip(column_ids, mean_ids): - block, 
delta_id = block.apply_binary_op(val_id, mean_val_id, ops.sub_op) - block, delta_power_id = block.apply_unary_op(delta_id, cube_op) - block = block.drop_columns([delta_id]) + delta = ops.sub_op.as_expr(val_id, mean_val_id) + delta_power = ops.pow_op.as_expr(delta, ex.const(n_power)) + block, delta_power_id = block.project_expr(delta_power) delta_ids.append(delta_power_id) return block, delta_ids @@ -664,31 +653,24 @@ def _skew_from_moments_and_count( # Calculate skew using count, third moment and population variance # See G1 estimator: # https://en.wikipedia.org/wiki/Skewness#Sample_skewness - block, denominator_id = block.apply_unary_op( - moment2_id, ops.partial_right(ops.unsafe_pow_op, 3 / 2) - ) - block, base_id = block.apply_binary_op(moment3_id, denominator_id, ops.div_op) - block, countminus1_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 1) - ) - block, countminus2_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 2) - ) - block, adjustment_id = block.apply_binary_op(count_id, countminus1_id, ops.mul_op) - block, adjustment_id = block.apply_unary_op( - adjustment_id, ops.partial_right(ops.unsafe_pow_op, 1 / 2) - ) - block, adjustment_id = block.apply_binary_op( - adjustment_id, countminus2_id, ops.div_op + denominator = ops.pow_op.as_expr(moment2_id, ex.const(3 / 2)) + base = ops.div_op.as_expr(moment3_id, denominator) + countminus1 = ops.sub_op.as_expr(count_id, ex.const(1)) + countminus2 = ops.sub_op.as_expr(count_id, ex.const(2)) + adjustment = ops.div_op.as_expr( + ops.unsafe_pow_op.as_expr( + ops.mul_op.as_expr(count_id, countminus1), ex.const(1 / 2) + ), + countminus2, ) - block, skew_id = block.apply_binary_op(base_id, adjustment_id, ops.mul_op) + + skew = ops.mul_op.as_expr(base, adjustment) # Need to produce NA if have less than 3 data points - block, na_cond_id = block.apply_unary_op(count_id, ops.partial_right(ops.ge_op, 3)) - block, skew_id = block.apply_binary_op( - skew_id, na_cond_id, 
ops.partial_arg3(ops.where_op, None) + cleaned_skew = ops.where_op.as_expr( + skew, ops.ge_op.as_expr(count_id, ex.const(3)), ex.const(None) ) - return block, skew_id + return block.project_expr(cleaned_skew) def _kurt_from_moments_and_count( @@ -701,49 +683,42 @@ def _kurt_from_moments_and_count( # adjustment = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) # kurtosis = (numerator / denominator) - adjustment - # Numerator - block, countminus1_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 1) - ) - block, countplus1_id = block.apply_unary_op( - count_id, ops.partial_right(ops.add_op, 1) + numerator = ops.mul_op.as_expr( + moment4_id, + ops.mul_op.as_expr( + ops.sub_op.as_expr(count_id, ex.const(1)), + ops.add_op.as_expr(count_id, ex.const(1)), + ), ) - block, num_adj = block.apply_binary_op(countplus1_id, countminus1_id, ops.mul_op) - block, numerator_id = block.apply_binary_op(moment4_id, num_adj, ops.mul_op) # Denominator - block, countminus2_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 2) - ) - block, countminus3_id = block.apply_unary_op( - count_id, ops.partial_right(ops.sub_op, 3) - ) - block, denom_adj = block.apply_binary_op(countminus2_id, countminus3_id, ops.mul_op) - block, popvar_squared = block.apply_unary_op( - moment2_id, ops.partial_right(ops.unsafe_pow_op, 2) + countminus2 = ops.sub_op.as_expr(count_id, ex.const(2)) + countminus3 = ops.sub_op.as_expr(count_id, ex.const(3)) + + # Denominator + denominator = ops.mul_op.as_expr( + ops.unsafe_pow_op.as_expr(moment2_id, ex.const(2)), + ops.mul_op.as_expr(countminus2, countminus3), ) - block, denominator_id = block.apply_binary_op(popvar_squared, denom_adj, ops.mul_op) # Adjustment - block, countminus1_square = block.apply_unary_op( - countminus1_id, ops.partial_right(ops.unsafe_pow_op, 2) - ) - block, adj_num = block.apply_unary_op( - countminus1_square, ops.partial_right(ops.mul_op, 3) + adj_num = ops.mul_op.as_expr( + ops.unsafe_pow_op.as_expr( + 
ops.sub_op.as_expr(count_id, ex.const(1)), ex.const(2) + ), + ex.const(3), ) - block, adj_denom = block.apply_binary_op(countminus2_id, countminus3_id, ops.mul_op) - block, adjustment_id = block.apply_binary_op(adj_num, adj_denom, ops.div_op) + adj_denom = ops.mul_op.as_expr(countminus2, countminus3) + adjustment = ops.div_op.as_expr(adj_num, adj_denom) # Combine - block, base_id = block.apply_binary_op(numerator_id, denominator_id, ops.div_op) - block, kurt_id = block.apply_binary_op(base_id, adjustment_id, ops.sub_op) + kurt = ops.sub_op.as_expr(ops.div_op.as_expr(numerator, denominator), adjustment) # Need to produce NA if have less than 4 data points - block, na_cond_id = block.apply_unary_op(count_id, ops.partial_right(ops.ge_op, 4)) - block, kurt_id = block.apply_binary_op( - kurt_id, na_cond_id, ops.partial_arg3(ops.where_op, None) + cleaned_kurt = ops.where_op.as_expr( + kurt, ops.ge_op.as_expr(count_id, ex.const(4)), ex.const(None) ) - return block, kurt_id + return block.project_expr(cleaned_kurt) def align( diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 57a497d2cb..7ac10df510 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -33,6 +33,7 @@ import bigframes.constants as constants import bigframes.core as core +import bigframes.core.expression as ex import bigframes.core.guid as guid import bigframes.core.indexes as indexes import bigframes.core.joins.name_resolution as join_names @@ -661,23 +662,32 @@ def with_index_labels(self, value: typing.Sequence[Label]) -> Block: index_labels=tuple(value), ) - def apply_unary_op( - self, column: str, op: ops.UnaryOp, result_label: Label = None + def project_expr( + self, expr: ex.Expression, label: Label = None ) -> typing.Tuple[Block, str]: """ - Apply a unary op to the block. Creates a new column to store the result. + Apply a scalar expression to the block. Creates a new column to store the result. 
""" # TODO(tbergeron): handle labels safely so callers don't need to result_id = guid.generate_guid() - expr = self._expr.project(op.as_expr(column), result_id) + array_val = self._expr.project(expr, result_id) block = Block( - expr, + array_val, index_columns=self.index_columns, - column_labels=[*self.column_labels, result_label], + column_labels=[*self.column_labels, label], index_labels=self.index.names, ) return (block, result_id) + def apply_unary_op( + self, column: str, op: ops.UnaryOp, result_label: Label = None + ) -> typing.Tuple[Block, str]: + """ + Apply a unary op to the block. Creates a new column to store the result. + """ + expr = op.as_expr(column) + return self.project_expr(expr, result_label) + def apply_binary_op( self, left_column_id: str, @@ -685,17 +695,8 @@ def apply_binary_op( op: ops.BinaryOp, result_label: Label = None, ) -> typing.Tuple[Block, str]: - result_id = guid.generate_guid() - expr = self._expr.project( - op.as_expr(left_column_id, right_column_id), result_id - ) - block = Block( - expr, - index_columns=self.index_columns, - column_labels=[*self.column_labels, result_label], - index_labels=self.index.names, - ) - return (block, result_id) + expr = op.as_expr(left_column_id, right_column_id) + return self.project_expr(expr, result_label) def apply_ternary_op( self, @@ -705,15 +706,8 @@ def apply_ternary_op( op: ops.TernaryOp, result_label: Label = None, ) -> typing.Tuple[Block, str]: - result_id = guid.generate_guid() - expr = self._expr.project(op.as_expr(col_id_1, col_id_2, col_id_3), result_id) - block = Block( - expr, - index_columns=self.index_columns, - column_labels=[*self.column_labels, result_label], - index_labels=self.index.names, - ) - return (block, result_id) + expr = op.as_expr(col_id_1, col_id_2, col_id_3) + return self.project_expr(expr, result_label) def multi_apply_window_op( self, @@ -1151,43 +1145,37 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1): conditions = [] if start != 0: if start 
> 0: - op = ops.partial_right(ops.ge_op, start) assert positive_offsets - block, start_cond = block.apply_unary_op(positive_offsets, op) + conditions.append(ops.ge_op.as_expr(positive_offsets, ex.const(start))) else: - op = ops.partial_right(ops.le_op, -start - 1) assert negative_offsets - block, start_cond = block.apply_unary_op(negative_offsets, op) - conditions.append(start_cond) + conditions.append( + ops.le_op.as_expr(negative_offsets, ex.const(-start - 1)) + ) if stop is not None: if stop >= 0: - op = ops.partial_right(ops.lt_op, stop) assert positive_offsets - block, stop_cond = block.apply_unary_op(positive_offsets, op) + conditions.append(ops.lt_op.as_expr(positive_offsets, ex.const(stop))) else: - op = ops.partial_right(ops.gt_op, -stop - 1) assert negative_offsets - block, stop_cond = block.apply_unary_op(negative_offsets, op) - conditions.append(stop_cond) - + conditions.append( + ops.gt_op.as_expr(negative_offsets, ex.const(-stop - 1)) + ) if step > 1: - op = ops.partial_right(ops.mod_op, step) if start >= 0: - op = ops.partial_right(ops.sub_op, start) assert positive_offsets - block, start_diff = block.apply_unary_op(positive_offsets, op) + start_diff = ops.sub_op.as_expr(positive_offsets, ex.const(start)) else: - op = ops.partial_right(ops.sub_op, -start + 1) assert negative_offsets - block, start_diff = block.apply_unary_op(negative_offsets, op) - modulo_op = ops.partial_right(ops.mod_op, step) - block, mod = block.apply_unary_op(start_diff, modulo_op) - is_zero_op = ops.partial_right(ops.eq_op, 0) - block, step_cond = block.apply_unary_op(mod, is_zero_op) + start_diff = ops.sub_op.as_expr(negative_offsets, ex.const(-start + 1)) + step_cond = ops.eq_op.as_expr( + ops.mod_op.as_expr(start_diff, ex.const(step)), ex.const(0) + ) conditions.append(step_cond) for cond in conditions: - block = block.filter(cond) + block, cond_id = block.project_expr(cond) + block = block.filter(cond_id) return block.select_columns(self.value_columns) @@ -1238,13 +1226,12 
@@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project( - expression=ops.AsTypeOp(to_type="string").as_expr(index_col), - output_id=index_col, + add_prefix = ops.add_op.as_expr( + ex.const(prefix), ops.AsTypeOp(to_type="string").as_expr(index_col) ) - prefix_op = ops.ApplyLeft(base_op=ops.add_op, left_scalar=prefix) expr = expr.project( - expression=prefix_op.as_expr(index_col), output_id=index_col + expression=add_prefix, + output_id=index_col, ) return Block( expr, @@ -1262,13 +1249,12 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project( - expression=ops.AsTypeOp(to_type="string").as_expr(index_col), - output_id=index_col, + add_suffix = ops.add_op.as_expr( + ops.AsTypeOp(to_type="string").as_expr(index_col), ex.const(suffix) ) - prefix_op = ops.ApplyRight(base_op=ops.add_op, right_scalar=suffix) expr = expr.project( - expression=prefix_op.as_expr(index_col), output_id=index_col + expression=add_suffix, + output_id=index_col, ) return Block( expr, @@ -1468,28 +1454,23 @@ def _create_pivot_column_index( def _create_pivot_col( block: Block, columns: typing.Sequence[str], value_col: str, value ) -> typing.Tuple[Block, str]: - cond_id = "" + condition: typing.Optional[ex.Expression] = None nlevels = len(columns) for i in range(len(columns)): uvalue_level = value[i] if nlevels > 1 else value if pd.isna(uvalue_level): - block, eq_id = block.apply_unary_op( - columns[i], - ops.isnull_op, - ) + equality = ops.isnull_op.as_expr(columns[i]) else: - block, eq_id = block.apply_unary_op( - columns[i], ops.partial_right(ops.eq_op, uvalue_level) - ) - if cond_id: - block, cond_id = block.apply_binary_op(eq_id, cond_id, ops.and_op) + equality = ops.eq_op.as_expr(columns[i], ex.const(uvalue_level)) + if condition is not None: + 
condition = ops.and_op.as_expr(equality, condition) else: - cond_id = eq_id - block, masked_id = block.apply_binary_op( - value_col, cond_id, ops.partial_arg3(ops.where_op, None) - ) + condition = equality - return block, masked_id + assert condition is not None + return block.project_expr( + ops.where_op.as_expr(value_col, condition, ex.const(None)) + ) def _get_unique_values( self, columns: Sequence[str], max_unique_values: int diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 2331d3aa28..67f1800415 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -28,7 +28,6 @@ import bigframes.constants as constants import bigframes.core.expression as expressions import bigframes.dtypes -import bigframes.dtypes as dtypes import bigframes.operations as ops _ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) @@ -1132,38 +1131,6 @@ def clip_op( ) -# Composition Ops -@scalar_op_compiler.register_unary_op(ops.ApplyRight, pass_op=True) -def apply_right(input: ibis_types.Value, op: ops.ApplyRight): - right = dtypes.literal_to_ibis_scalar(op.right_scalar, validate=False) - return scalar_op_compiler.compile_row_op(op.base_op, (input, right)) - - -@scalar_op_compiler.register_unary_op(ops.ApplyLeft, pass_op=True) -def apply_left(input: ibis_types.Value, op: ops.ApplyLeft): - left = dtypes.literal_to_ibis_scalar(op.left_scalar, validate=False) - return scalar_op_compiler.compile_row_op(op.base_op, (left, input)) - - -@scalar_op_compiler.register_binary_op(ops.ReverseArgsOp, pass_op=True) -def apply_reversed( - input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ReverseArgsOp -): - return scalar_op_compiler.compile_row_op(op.base_op, (input2, input1)) - - -@scalar_op_compiler.register_binary_op(ops.ApplyArg1, pass_op=True) -def apply_arg1(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg1): - arg1 = 
dtypes.literal_to_ibis_scalar(op.scalar, validate=False) - return scalar_op_compiler.compile_row_op(op.base_op, (arg1, input1, input2)) - - -@scalar_op_compiler.register_binary_op(ops.ApplyArg3, pass_op=True) -def apply_arg3(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg3): - arg3 = dtypes.literal_to_ibis_scalar(op.scalar, validate=False) - return scalar_op_compiler.compile_row_op(op.base_op, (input1, input2, arg3)) - - # Helpers def is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 2fb1ccb988..a8b13ef61f 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -22,6 +22,14 @@ import bigframes.operations +def const(value: typing.Hashable): + return ScalarConstantExpression(value) + + +def free_var(id: str): + return UnboundVariableExpression(id) + + @dataclasses.dataclass(frozen=True) class Expression(abc.ABC): """An expression represents a computation taking N scalar inputs and producing a single output scalar.""" @@ -30,6 +38,9 @@ class Expression(abc.ABC): def unbound_variables(self) -> typing.Tuple[str, ...]: return () + def rename(self, name_mapping: dict[str, str]): + return self + @dataclasses.dataclass(frozen=True) class ScalarConstantExpression(Expression): @@ -49,6 +60,12 @@ class UnboundVariableExpression(Expression): def unbound_variables(self) -> typing.Tuple[str, ...]: return (self.id,) + def rename(self, name_mapping: dict[str, str]): + if self.id in name_mapping: + return UnboundVariableExpression(name_mapping[self.id]) + else: + return self + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): @@ -67,3 +84,8 @@ def unbound_variables(self) -> typing.Tuple[str, ...]: map(lambda x: x.unbound_variables, self.inputs) ) ) + + def rename(self, name_mapping: dict[str, str]): + return OpExpression( + self.op, tuple(input.rename(name_mapping) for input in self.inputs) + ) diff --git 
a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 66ba901649..ab6b15e7b9 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -28,7 +28,6 @@ import bigframes.core.window as windows import bigframes.dataframe as df import bigframes.dtypes as dtypes -import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.series as series import third_party.bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby @@ -540,10 +539,13 @@ def cummin(self, *args, **kwargs) -> series.Series: ) def cumcount(self, *args, **kwargs) -> series.Series: - return self._apply_window_op( - agg_ops.rank_op, - discard_name=True, - )._apply_unary_op(ops.partial_right(ops.sub_op, 1)) + return ( + self._apply_window_op( + agg_ops.rank_op, + discard_name=True, + ) + - 1 + ) def shift(self, periods=1) -> series.Series: """Shift index by desired number of periods.""" diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 12a1303d29..b03fdfbcfe 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -22,6 +22,7 @@ import bigframes.constants as constants import bigframes.core.blocks +import bigframes.core.expression as ex import bigframes.core.guid as guid import bigframes.core.indexes as indexes import bigframes.core.scalar @@ -63,17 +64,14 @@ def __setitem__(self, key, value) -> None: index_column = block.index_columns[0] # if index == key return value else value_colum - block, insert_cond = block.apply_unary_op( - index_column, ops.partial_right(ops.eq_op, key) - ) - block, result_id = block.apply_binary_op( - insert_cond, - self._series._value_column, - ops.partial_arg1(ops.where_op, value), - ) - block = block.copy_values(result_id, value_column).drop_columns( - [insert_cond, result_id] + block, result_id = block.project_expr( + ops.where_op.as_expr( + ex.const(value), + ops.eq_op.as_expr(index_column, ex.const(key)), + 
self._series._value_column, + ) ) + block = block.copy_values(result_id, value_column).drop_columns([result_id]) self._series._set_block(block) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 6602170b5f..378cb3cac3 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -26,6 +26,7 @@ import bigframes.core as core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.joins as joining import bigframes.core.ordering as order import bigframes.core.utils as utils @@ -186,7 +187,7 @@ def astype( ) -> Index: if self.nlevels > 1: raise TypeError("Multiindex does not support 'astype'") - return self._apply_unary_op(ops.AsTypeOp(to_type=dtype)) + return self._apply_unary_expr(ops.AsTypeOp(to_type=dtype).as_expr("arg")) def all(self) -> bool: if self.nlevels > 1: @@ -261,7 +262,7 @@ def value_counts( def fillna(self, value=None) -> Index: if self.nlevels > 1: raise TypeError("Multiindex does not support 'fillna'") - return self._apply_unary_op(ops.partial_right(ops.fillna_op, value)) + return self._apply_unary_expr(ops.fillna_op.as_expr("arg", ex.const(value))) def rename(self, name: Union[str, Sequence[str]]) -> Index: names = [name] if isinstance(name, str) else list(name) @@ -284,8 +285,8 @@ def drop( inverse_condition_id, ops.invert_op ) else: - block, condition_id = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, labels) + block, condition_id = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(labels)) ) block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) @@ -308,19 +309,25 @@ def isin(self, values) -> Index: f"isin(), you passed a [{type(values).__name__}]" ) - return self._apply_unary_op( - ops.IsInOp(values=tuple(values), match_nulls=True) + return self._apply_unary_expr( + ops.IsInOp(values=tuple(values), match_nulls=True).as_expr("arg") 
).fillna(value=False) - def _apply_unary_op( + def _apply_unary_expr( self, - op: ops.UnaryOp, + op: ex.Expression, ) -> Index: """Applies a unary operator to the index.""" + if len(op.unbound_variables) != 1: + raise ValueError("Expression must have exactly 1 unbound variable.") + unbound_variable = op.unbound_variables[0] + block = self._block result_ids = [] for col in self._block.index_columns: - block, result_id = block.apply_unary_op(col, op) + block, result_id = block.project_expr( + op.rename({unbound_variable: col}) + ) result_ids.append(result_id) block = block.set_index(result_ids, index_labels=self._block.index_labels) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index d9cc99a036..cadd8e5145 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -20,6 +20,7 @@ import bigframes.constants as constants import bigframes.core as core +import bigframes.core.expression as ex import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dataframe @@ -165,7 +166,7 @@ def qcut( ordering=(order.OrderingColumnReference(x._value_column),), ), ) - block, result = block.apply_binary_op( - result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label + block, result = block.project_expr( + ops.where_op.as_expr(result, nullity_id, ex.const(None)), label=label ) return bigframes.series.Series(block.select_column(result)) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1f039904f0..1fa4c70ab6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -47,6 +47,7 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.groupby as groupby import bigframes.core.guid import bigframes.core.indexers as indexers @@ -656,25 +657,34 @@ def _apply_binop( op, axis: str | int = 
"columns", how: str = "outer", + reverse: bool = False, ): if isinstance(other, (float, int)): - return self._apply_scalar_binop(other, op) + return self._apply_scalar_binop(other, op, reverse=reverse) elif isinstance(other, bigframes.series.Series): - return self._apply_series_binop(other, op, axis=axis, how=how) + return self._apply_series_binop( + other, op, axis=axis, how=how, reverse=reverse + ) elif isinstance(other, DataFrame): - return self._apply_dataframe_binop(other, op, how=how) + return self._apply_dataframe_binop(other, op, how=how, reverse=reverse) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}." f"{constants.FEEDBACK_LINK}" ) - def _apply_scalar_binop(self, other: float | int, op: ops.BinaryOp) -> DataFrame: + def _apply_scalar_binop( + self, other: float | int, op: ops.BinaryOp, reverse: bool = False + ) -> DataFrame: block = self._block - partial_op = ops.ApplyRight(base_op=op, right_scalar=other) for column_id, label in zip( self._block.value_columns, self._block.column_labels ): - block, _ = block.apply_unary_op(column_id, partial_op, result_label=label) + expr = ( + op.as_expr(ex.const(other), column_id) + if reverse + else op.as_expr(column_id, ex.const(other)) + ) + block, _ = block.project_expr(expr, label) block = block.drop_columns([column_id]) return DataFrame(block) @@ -684,6 +694,7 @@ def _apply_series_binop( op: ops.BinaryOp, axis: str | int = "columns", how: str = "outer", + reverse: bool = False, ) -> DataFrame: if axis not in ("columns", "index", 0, 1): raise ValueError(f"Invalid input: axis {axis}.") @@ -703,12 +714,13 @@ def _apply_series_binop( for column_id, label in zip( self._block.value_columns, self._block.column_labels ): - block, _ = block.apply_binary_op( - get_column_left[column_id], - series_col, - op, - result_label=label, + self_col = get_column_left[column_id] + expr = ( + op.as_expr(series_col, self_col) + if reverse + else op.as_expr(self_col, 
series_col) ) + block, _ = block.project_expr(expr, label) block = block.drop_columns([get_column_left[column_id]]) block = block.drop_columns([series_col]) @@ -716,7 +728,11 @@ def _apply_series_binop( return DataFrame(block) def _apply_dataframe_binop( - self, other: DataFrame, op: ops.BinaryOp, how: str = "outer" + self, + other: DataFrame, + op: ops.BinaryOp, + how: str = "outer", + reverse: bool = False, ) -> DataFrame: # Join rows joined_index, (get_column_left, get_column_right) = self._block.index.join( @@ -724,9 +740,11 @@ def _apply_dataframe_binop( ) # join columns schema # indexers will be none for exact match - columns, lcol_indexer, rcol_indexer = self.columns.join( + columns, self_indexer, other_indexer = self.columns.join( other.columns, how=how, return_indexers=True ) + lcol_indexer = self_indexer if not reverse else other_indexer + rcol_indexer = other_indexer if not reverse else self_indexer binop_result_ids = [] block = joined_index._block @@ -740,29 +758,18 @@ def _apply_dataframe_binop( if left_index >= 0 and right_index >= 0: # -1 indices indicate missing left_col_id = self._block.value_columns[left_index] right_col_id = other._block.value_columns[right_index] - block, result_col_id = block.apply_binary_op( - get_column_left[left_col_id], - get_column_right[right_col_id], - op, - ) - binop_result_ids.append(result_col_id) + expr = op.as_expr(get_column_left[left_col_id], get_column_right[right_col_id]) elif left_index >= 0: left_col_id = self._block.value_columns[left_index] - block, result_col_id = block.apply_unary_op( - get_column_left[left_col_id], - ops.partial_right(op, None), - ) - binop_result_ids.append(result_col_id) + expr = op.as_expr(get_column_left[left_col_id], ex.const(None)) elif right_index >= 0: right_col_id = other._block.value_columns[right_index] - block, result_col_id = block.apply_unary_op( - get_column_right[right_col_id], - ops.partial_left(op, None), - ) - binop_result_ids.append(result_col_id) + expr = 
op.as_expr(ex.const(None), get_column_right[right_col_id]) else: # Should not be 
possible raise ValueError("No right or left index.") + block, result_col_id = block.project_expr(expr) + binop_result_ids.append(result_col_id) block = block.select_columns(binop_result_ids).with_column_labels(columns) return DataFrame(block) @@ -822,7 +829,7 @@ def rsub( other: float | int | bigframes.series.Series | DataFrame, axis: str | int = "columns", ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.sub_op), axis=axis) + return self._apply_binop(other, ops.sub_op, axis=axis, reverse=True) __rsub__ = rsub @@ -849,7 +856,7 @@ def rtruediv( other: float | int | bigframes.series.Series | DataFrame, axis: str | int = "columns", ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.div_op), axis=axis) + return self._apply_binop(other, ops.div_op, axis=axis, reverse=True) __rtruediv__ = rdiv = rtruediv @@ -867,7 +874,7 @@ def rfloordiv( other: float | int | bigframes.series.Series | DataFrame, axis: str | int = "columns", ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.floordiv_op), axis=axis) + return self._apply_binop(other, ops.floordiv_op, axis=axis, reverse=True) __rfloordiv__ = rfloordiv @@ -875,7 +882,7 @@ def mod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int return self._apply_binop(other, ops.mod_op, axis=axis) def rmod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore - return self._apply_binop(other, ops.reverse(ops.mod_op), axis=axis) + return self._apply_binop(other, ops.mod_op, axis=axis, reverse=True) __mod__ = mod @@ -889,7 +896,7 @@ def pow( def rpow( self, other: int | bigframes.series.Series, axis: str | int = "columns" ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.pow_op), axis=axis) + return self._apply_binop(other, ops.pow_op, axis=axis, reverse=True) __pow__ = pow @@ -1101,8 +1108,8 @@ def drop( condition_id = None for i, idx in enumerate(index): level_id = self._resolve_levels(i)[0] - 
block, condition_id_cur = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, idx) + block, condition_id_cur = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(idx)) ) if condition_id: block, condition_id = block.apply_binary_op( @@ -1122,8 +1129,8 @@ def drop( elif isinstance(index, indexes.Index): return self._drop_by_index(index) else: - block, condition_id = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, index) + block, condition_id = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(index)) ) block = block.filter(condition_id, keep_null=True).select_columns( self._block.value_columns @@ -3031,7 +3038,7 @@ def __array_ufunc__( if inputs[0] is self: return self._apply_binop(inputs[1], binop) else: - return self._apply_binop(inputs[0], ops.reverse(binop)) + return self._apply_binop(inputs[0], binop, reverse=True) return NotImplemented diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 39ca52394e..9737df94f9 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -49,11 +49,13 @@ def name(self) -> str: def arguments(self) -> int: return 1 - def as_expr(self, input_id: str) -> bigframes.core.expression.Expression: + def as_expr( + self, input_id: typing.Union[str, bigframes.core.expression.Expression] = "arg" + ) -> bigframes.core.expression.Expression: import bigframes.core.expression return bigframes.core.expression.OpExpression( - self, (bigframes.core.expression.UnboundVariableExpression(input_id),) + self, (_convert_expr_input(input_id),) ) @@ -68,15 +70,17 @@ def arguments(self) -> int: return 2 def as_expr( - self, left_input: str, right_input: str + self, + left_input: typing.Union[str, bigframes.core.expression.Expression] = "arg1", + right_input: typing.Union[str, bigframes.core.expression.Expression] = "arg2", ) -> bigframes.core.expression.Expression: import bigframes.core.expression return bigframes.core.expression.OpExpression( self, ( - 
bigframes.core.expression.UnboundVariableExpression(left_input), - bigframes.core.expression.UnboundVariableExpression(right_input), + _convert_expr_input(left_input), + _convert_expr_input(right_input), ), ) @@ -92,20 +96,35 @@ def arguments(self) -> int: return 3 def as_expr( - self, input1: str, input2: str, input3: str + self, + input1: typing.Union[str, bigframes.core.expression.Expression] = "arg1", + input2: typing.Union[str, bigframes.core.expression.Expression] = "arg2", + input3: typing.Union[str, bigframes.core.expression.Expression] = "arg3", ) -> bigframes.core.expression.Expression: import bigframes.core.expression return bigframes.core.expression.OpExpression( self, ( - bigframes.core.expression.UnboundVariableExpression(input1), - bigframes.core.expression.UnboundVariableExpression(input2), - bigframes.core.expression.UnboundVariableExpression(input3), + _convert_expr_input(input1), + _convert_expr_input(input2), + _convert_expr_input(input3), ), ) +def _convert_expr_input( + input: typing.Union[str, bigframes.core.expression.Expression] +) -> bigframes.core.expression.Expression: + """Allows creating free variables with just a string""" + import bigframes.core.expression + + if isinstance(input, str): + return bigframes.core.expression.UnboundVariableExpression(input) + else: + return input + + # Operation Factories def create_unary_op(name: str) -> UnaryOp: return dataclasses.make_dataclass( @@ -309,63 +328,6 @@ class MapOp(UnaryOp): mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...] 
-# Operation Composition -# Meta-ops that do partial application or parameter remapping -# Subject to change, may convert to explicit tree -@dataclasses.dataclass(frozen=True) -class ApplyRight(UnaryOp): - name: typing.ClassVar[str] = "apply_right" - base_op: BinaryOp - right_scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ApplyLeft(UnaryOp): - name: typing.ClassVar[str] = "apply_left" - base_op: BinaryOp - left_scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ApplyArg1(BinaryOp): - name: typing.ClassVar[str] = "apply_arg1" - base_op: TernaryOp - scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ApplyArg3(BinaryOp): - name: typing.ClassVar[str] = "apply_arg3" - base_op: TernaryOp - scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ReverseArgsOp(BinaryOp): - name: typing.ClassVar[str] = "apply_reverse" - base_op: BinaryOp - - -def partial_left(op: BinaryOp, scalar: typing.Any) -> UnaryOp: - return ApplyLeft(base_op=op, left_scalar=scalar) - - -def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp: - return ApplyRight(base_op=op, right_scalar=scalar) - - -def partial_arg1(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return ApplyArg1(base_op=op, scalar=scalar) - - -def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return ApplyArg3(base_op=op, scalar=scalar) - - -def reverse(op: BinaryOp) -> BinaryOp: - return ReverseArgsOp(base_op=op) - - # Binary Ops fillna_op = create_binary_op(name="fillna") cliplower_op = create_binary_op(name="clip_lower") diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 8989255f7e..077815a9d6 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -20,6 +20,7 @@ import bigframes.constants as constants import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.scalar as scalars import bigframes.dtypes import bigframes.operations as ops @@ -136,6 +137,7 
@@ def _apply_binary_op( other: typing.Any, op: ops.BinaryOp, alignment: typing.Literal["outer", "left"] = "outer", + reverse: bool = False, ) -> series.Series: """Applies a binary operator to the series and other.""" if isinstance(other, pd.Series): @@ -144,11 +146,7 @@ def _apply_binary_op( f"Pandas series not supported as operand. {constants.FEEDBACK_LINK}" ) if isinstance(other, series.Series): - (left, right, block) = self._align(other, how=alignment) - - block, result_id = block.apply_binary_op( - left, right, op, self._value_column - ) + (self_col, other_col, block) = self._align(other, how=alignment) name = self._name if ( @@ -157,13 +155,20 @@ def _apply_binary_op( and alignment == "outer" ): name = None - - return series.Series( - block.select_column(result_id).assign_label(result_id, name) + expr = op.as_expr( + other_col if reverse else self_col, self_col if reverse else other_col ) + block, result_id = block.project_expr(expr, name) + return series.Series(block.select_column(result_id)) + else: - partial_op = ops.ApplyRight(base_op=op, right_scalar=other) - return self._apply_unary_op(partial_op) + name = self._name + expr = op.as_expr( + ex.const(other) if reverse else self._value_column, + self._value_column if reverse else ex.const(other), + ) + block, result_id = self._block.project_expr(expr, name) + return series.Series(block.select_column(result_id)) def _apply_corr_aggregation(self, other: series.Series) -> float: (left, right, block) = self._align(other, how="outer") diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 0f060a23e8..554acda202 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -48,6 +48,7 @@ import bigframes._config as config import bigframes.constants as constants import bigframes.core.blocks +import bigframes.core.expression as ex import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.core.reshape @@ -294,14 +295,13 @@ def 
_perform_get_dummies_block_operations( new_column_label = f"{column_label}{value}" if column_label == "": new_column_label = value - new_block, new_id = block.apply_unary_op( - column_id, ops.ApplyLeft(ops.eq_op, value) + new_block, new_id = block.project_expr( + ops.eq_op.as_expr(column_id, ex.const(value)) ) intermediate_col_ids.append(new_id) - block, _ = new_block.apply_unary_op( - new_id, - ops.ApplyRight(ops.fillna_op, False), - result_label=new_column_label, + block, _ = new_block.project_expr( + ops.fillna_op.as_expr(new_id, ex.const(False)), + label=new_column_label, ) if dummy_na: # dummy column name for na depends on the dtype diff --git a/bigframes/series.py b/bigframes/series.py index 8f564423fc..aaac896d72 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -34,6 +34,7 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.groupby as groupby import bigframes.core.indexers import bigframes.core.indexes as indexes @@ -188,8 +189,8 @@ def rename( # Will throw if value type isn't compatible with index type. 
block, const_id = block.create_constant(v, dtype=idx_dtype) - block, cond_id = block.apply_unary_op( - idx_id, ops.ApplyRight(base_op=ops.ne_op, right_scalar=k) + block, cond_id = block.project_expr( + ops.ne_op.as_expr(idx_id, ex.const(k)) ) block, new_idx_id = block.apply_ternary_op( idx_id, cond_id, const_id, ops.where_op @@ -343,8 +344,8 @@ def drop( inverse_condition_id, ops.invert_op ) else: - block, condition_id = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, index) + block, condition_id = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(index)) ) block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) @@ -489,11 +490,8 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value): block, cond = self._block.apply_unary_op( self._value_column, ops.IsInOp(tuple(to_replace_list)) ) - block, result_col = block.apply_binary_op( - cond, - self._value_column, - ops.partial_arg1(ops.where_op, value), - result_label=self.name, + block, result_col = block.project_expr( + ops.where_op.as_expr("true_value", cond, self._value_column), self.name ) return Series(block.select_column(result_col)) @@ -606,7 +604,7 @@ def add(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.add_op) def radd(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.add_op)) + return self._apply_binary_op(other, ops.add_op, reverse=True) def __sub__(self, other: float | int | Series) -> Series: return self.sub(other) @@ -618,7 +616,7 @@ def sub(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.sub_op) def rsub(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.sub_op)) + return self._apply_binary_op(other, ops.sub_op, reverse=True) subtract = sub @@ -632,7 +630,7 @@ def mul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, 
ops.mul_op) def rmul(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.mul_op)) + return self._apply_binary_op(other, ops.mul_op, reverse=True) multiply = mul @@ -646,7 +644,7 @@ def truediv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.div_op) def rtruediv(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.div_op)) + return self._apply_binary_op(other, ops.div_op, reverse=True) div = truediv @@ -664,7 +662,7 @@ def floordiv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.floordiv_op) def rfloordiv(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.floordiv_op)) + return self._apply_binary_op(other, ops.floordiv_op, reverse=True) def __pow__(self, other: float | int | Series) -> Series: return self.pow(other) @@ -676,7 +674,7 @@ def pow(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.pow_op) def rpow(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.pow_op)) + return self._apply_binary_op(other, ops.pow_op, reverse=True) def __lt__(self, other: float | int | Series) -> Series: # type: ignore return self.lt(other) @@ -712,7 +710,7 @@ def mod(self, other) -> Series: # type: ignore return self._apply_binary_op(other, ops.mod_op) def rmod(self, other) -> Series: # type: ignore - return self._apply_binary_op(other, ops.reverse(ops.mod_op)) + return self._apply_binary_op(other, ops.mod_op, reverse=True) def divmod(self, other) -> Tuple[Series, Series]: # type: ignore # TODO(huanc): when self and other both has dtype int and other contains zeros, @@ -1524,7 +1522,7 @@ def __array_ufunc__( if inputs[0] is self: return self._apply_binary_op(inputs[1], binop) else: - return self._apply_binary_op(inputs[0], ops.reverse(binop)) + return self._apply_binary_op(inputs[0], 
binop, reverse=True) return NotImplemented From f97285b5648d26cf3d3dc11d9db44570408e3f19 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 00:12:08 +0000 Subject: [PATCH 2/5] fix bugs --- bigframes/core/block_transforms.py | 10 ++++++---- bigframes/core/blocks.py | 6 +++--- bigframes/core/compile/scalar_op_compiler.py | 2 ++ bigframes/core/expression.py | 12 ++++++------ bigframes/core/indexers.py | 2 +- bigframes/core/indexes/index.py | 4 +--- 6 files changed, 19 insertions(+), 17 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 96eb7a49e3..345adb6be3 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -653,10 +653,12 @@ def _skew_from_moments_and_count( # Calculate skew using count, third moment and population variance # See G1 estimator: # https://en.wikipedia.org/wiki/Skewness#Sample_skewness - denominator = ops.pow_op.as_expr(moment2_id, ex.const(3 / 2)) - base = ops.div_op.as_expr(moment3_id, denominator) + moments_estimator = ops.div_op.as_expr( + moment3_id, ops.pow_op.as_expr(moment2_id, ex.const(3 / 2)) + ) + countminus1 = ops.sub_op.as_expr(count_id, ex.const(1)) - countminus2 = ops.sub_op.as_expr(count_id, ex.const(1)) + countminus2 = ops.sub_op.as_expr(count_id, ex.const(2)) adjustment = ops.div_op.as_expr( ops.unsafe_pow_op.as_expr( ops.mul_op.as_expr(count_id, countminus1), ex.const(1 / 2) @@ -664,7 +666,7 @@ def _skew_from_moments_and_count( countminus2, ) - skew = ops.mul_op.as_expr(base, adjustment) + skew = ops.mul_op.as_expr(moments_estimator, adjustment) # Need to produce NA if have less than 3 data points cleaned_skew = ops.where_op.as_expr( diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 7ac10df510..66fd592cf1 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1454,14 +1454,14 @@ def _create_pivot_column_index( def _create_pivot_col( block: Block, columns: typing.Sequence[str], 
value_col: str, value ) -> typing.Tuple[Block, str]: - condition: typing.Optional[ex.Expression] + condition: typing.Optional[ex.Expression] = None nlevels = len(columns) for i in range(len(columns)): uvalue_level = value[i] if nlevels > 1 else value if pd.isna(uvalue_level): - equality = ops.isnull_op.as_expr(ex.const(columns[i])) + equality = ops.isnull_op.as_expr(columns[i]) else: - equality = ops.eq_op.as_expr(ex.const(columns[i]), uvalue_level) + equality = ops.eq_op.as_expr(columns[i], ex.const(uvalue_level)) if condition is not None: condition = ops.and_op.as_expr(equality, condition) else: diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 67f1800415..f0bfacadc4 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -64,6 +64,8 @@ def _( expression: expressions.ScalarConstantExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: + if pd.isnull(expression.value): + return ibis.null() return ibis.literal(expression.value) @compile_expression.register diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index a8b13ef61f..59c3e595d9 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -22,11 +22,11 @@ import bigframes.operations -def const(value: typing.Hashable): +def const(value: typing.Hashable) -> Expression: return ScalarConstantExpression(value) -def free_var(id: str): +def free_var(id: str) -> Expression: return UnboundVariableExpression(id) @@ -38,7 +38,7 @@ class Expression(abc.ABC): def unbound_variables(self) -> typing.Tuple[str, ...]: return () - def rename(self, name_mapping: dict[str, str]): + def rename(self, name_mapping: dict[str, str]) -> Expression: return self @@ -60,9 +60,9 @@ class UnboundVariableExpression(Expression): def unbound_variables(self) -> typing.Tuple[str, ...]: return (self.id,) - def rename(self, name_mapping: dict[str, str]): + def 
rename(self, name_mapping: dict[str, str]) -> Expression: if self.id in name_mapping: - UnboundVariableExpression(name_mapping[self.id]) + return UnboundVariableExpression(name_mapping[self.id]) else: return self @@ -85,7 +85,7 @@ def unbound_variables(self) -> typing.Tuple[str, ...]: ) ) - def rename(self, name_mapping: dict[str, str]): + def rename(self, name_mapping: dict[str, str]) -> Expression: return OpExpression( self.op, tuple(input.rename(name_mapping) for input in self.inputs) ) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index b03fdfbcfe..9583a6e406 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -64,7 +64,7 @@ def __setitem__(self, key, value) -> None: index_column = block.index_columns[0] # if index == key return value else value_colum - blcok, result_id = block.project_expr( + block, result_id = block.project_expr( ops.where_op.as_expr( ex.const(value), ops.eq_op.as_expr(index_column, ex.const(key)), diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 378cb3cac3..c02fdbb12d 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -325,9 +325,7 @@ def _apply_unary_expr( block = self._block result_ids = [] for col in self._block.index_columns: - block, result_id = block.project_expr( - op.rename({unbound_variable: ex.free_var(col)}) - ) + block, result_id = block.project_expr(op.rename({unbound_variable: col})) result_ids.append(result_id) block = block.set_index(result_ids, index_labels=self._block.index_labels) From 5018d29d3190aae8fe518426e5ef298c36abbf57 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 00:21:30 +0000 Subject: [PATCH 3/5] fix more bugs --- bigframes/core/indexers.py | 2 +- bigframes/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 9583a6e406..6998d0e974 100644 --- a/bigframes/core/indexers.py +++ 
b/bigframes/core/indexers.py @@ -68,7 +68,7 @@ def __setitem__(self, key, value) -> None: ops.where_op.as_expr( ex.const(value), ops.eq_op.as_expr(index_column, ex.const(key)), - self._series.value_column, + self._series._value_column, ) ) block = block.copy_values(result_id, value_column).drop_columns([result_id]) diff --git a/bigframes/series.py b/bigframes/series.py index e9140ef9cb..90d0cf9529 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -490,7 +490,7 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value): self._value_column, ops.IsInOp(tuple(to_replace_list)) ) block, result_col = block.project_expr( - ops.where_op.as_expr("true_value", cond, self._value_column), self.name + ops.where_op.as_expr(ex.const(value), cond, self._value_column), self.name ) return Series(block.select_column(result_col)) From 8d8cf7e1a58bd26a0175013b225e3d5b0872a0cd Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 00:40:06 +0000 Subject: [PATCH 4/5] mypy fix --- bigframes/core/compile/scalar_op_compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index f0bfacadc4..115242e722 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -64,7 +64,7 @@ def _( expression: expressions.ScalarConstantExpression, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - if pd.isnull(expression.value): + if pd.isnull(expression.value): # type: ignore return ibis.null() return ibis.literal(expression.value) From 573ef029e37dc58d34573987da2df0ff43cea559 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Jan 2024 17:53:55 +0000 Subject: [PATCH 5/5] fix df-df binop bug --- bigframes/dataframe.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 
1fa4c70ab6..1288117395 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -740,11 +740,9 @@ def _apply_dataframe_binop( ) # join columns schema # indexers will be none for exact match - columns, self_indexer, other_indexer = self.columns.join( + columns, lcol_indexer, rcol_indexer = self.columns.join( other.columns, how=how, return_indexers=True ) - lcol_indexer = self_indexer if not reverse else other_indexer - rcol_indexer = other_indexer if not reverse else self_indexer binop_result_ids = [] block = joined_index._block @@ -756,15 +754,27 @@ def _apply_dataframe_binop( for left_index, right_index in column_indices: if left_index >= 0 and right_index >= 0: # -1 indices indicate missing - left_col_id = self._block.value_columns[left_index] - right_col_id = other._block.value_columns[right_index] - expr = op.as_expr(left_col_id, right_col_id) + self_col_id = get_column_left[self._block.value_columns[left_index]] + other_col_id = get_column_right[other._block.value_columns[right_index]] + expr = ( + op.as_expr(other_col_id, self_col_id) + if reverse + else op.as_expr(self_col_id, other_col_id) + ) elif left_index >= 0: - left_col_id = self._block.value_columns[left_index] - expr = op.as_expr(left_col_id, ex.const(None)) + self_col_id = get_column_left[self._block.value_columns[left_index]] + expr = ( + op.as_expr(ex.const(None), self_col_id) + if reverse + else op.as_expr(self_col_id, ex.const(None)) + ) elif right_index >= 0: - right_col_id = other._block.value_columns[right_index] - expr = op.as_expr(ex.const(None), right_col_id) + other_col_id = get_column_right[other._block.value_columns[right_index]] + expr = ( + op.as_expr(other_col_id, ex.const(None)) + if reverse + else op.as_expr(ex.const(None), other_col_id) + ) else: # Should not be possible raise ValueError("No right or left index.")