From 30f312d9c83561240003458bdd69866e28f40eda Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 11 Oct 2023 21:18:34 +0000 Subject: [PATCH 1/4] refactor: unify row operators to same interface --- bigframes/core/__init__.py | 15 +- bigframes/core/block_transforms.py | 4 +- bigframes/core/blocks.py | 12 +- bigframes/core/compile/compiled.py | 57 +- bigframes/core/compile/compiler.py | 20 +- bigframes/core/compile/scalar_op_compiler.py | 1065 ++++++++++++++ bigframes/core/indexes/index.py | 10 +- bigframes/core/nodes.py | 23 +- bigframes/dataframe.py | 28 +- bigframes/operations/__init__.py | 1296 ++++-------------- bigframes/operations/base.py | 4 +- bigframes/operations/strings.py | 50 +- bigframes/operations/structs.py | 19 +- bigframes/pandas/__init__.py | 4 +- bigframes/series.py | 35 +- 15 files changed, 1464 insertions(+), 1178 deletions(-) create mode 100644 bigframes/core/compile/scalar_op_compiler.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index e19fec8f3f..e5853e30db 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -232,8 +232,8 @@ def project_unary_op( ) -> ArrayValue: """Creates a new expression based on this expression with unary operation applied to one column.""" return ArrayValue( - nodes.ProjectUnaryOpNode( - child=self.node, input_id=column_name, op=op, output_id=output_name + nodes.ProjectRowOpNode( + child=self.node, input_ids=(column_name,), op=op, output_id=output_name ) ) @@ -246,10 +246,9 @@ def project_binary_op( ) -> ArrayValue: """Creates a new expression based on this expression with binary operation applied to two columns.""" return ArrayValue( - nodes.ProjectBinaryOpNode( + nodes.ProjectRowOpNode( child=self.node, - left_input_id=left_column_id, - right_input_id=right_column_id, + input_ids=(left_column_id, right_column_id), op=op, output_id=output_column_id, ) @@ -265,11 +264,9 @@ def project_ternary_op( ) -> ArrayValue: """Creates a new expression based on this expression with ternary operation applied to three columns.""" return ArrayValue( - nodes.ProjectTernaryOpNode( + nodes.ProjectRowOpNode( child=self.node, - input_id1=col_id_1, - input_id2=col_id_2, - input_id3=col_id_3, + input_ids=(col_id_1, col_id_2, col_id_3), op=op, output_id=output_column_id, ) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index ce0fdd219a..e486dd9bad 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -45,7 +45,7 @@ def equals(block1: blocks.Block, block2: blocks.Block) -> bool: lcolmapped = lmap[lcol] rcolmapped = rmap[rcol] joined_block, result_id = joined_block.apply_binary_op( - lcolmapped, rcolmapped, ops.eq_nulls_match_op + lcolmapped, rcolmapped, ops.eq_null_match_op ) joined_block, result_id = joined_block.apply_unary_op( result_id, ops.partial_right(ops.fillna_op, False) @@ -444,7 +444,7 @@ def rank( if method in ["min", "max", "first", "dense"]: # Pandas rank always produces Float64, so must cast for aggregation types that produce ints block = block.multi_apply_unary_op( - rownum_col_ids, ops.AsTypeOp(pd.Float64Dtype()) + rownum_col_ids, ops.AsTypeOp(to_type=pd.Float64Dtype()) ) if na_option == "keep": # For na_option "keep", null inputs must produce null outputs diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 34913872e7..a66a857cff 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -566,12 +566,12 @@ def _split( # Create an ordering col and convert to string block, ordering_col = 
block.promote_offsets() block, string_ordering_col = block.apply_unary_op( - ordering_col, ops.AsTypeOp("string[pyarrow]") + ordering_col, ops.AsTypeOp(to_type="string[pyarrow]") ) # Apply hash method to sum col and order by it. block, string_sum_col = block.apply_binary_op( - string_ordering_col, random_state_col, ops.concat_op + string_ordering_col, random_state_col, ops.strconcat_op ) block, hash_string_sum_col = block.apply_unary_op(string_sum_col, ops.hash_op) block = block.order_by([ordering.OrderingColumnReference(hash_string_sum_col)]) @@ -1237,8 +1237,8 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project_unary_op(index_col, ops.AsTypeOp("string")) - prefix_op = ops.BinopPartialLeft(ops.add_op, prefix) + expr = expr.project_unary_op(index_col, ops.AsTypeOp(to_type="string")) + prefix_op = ops.ApplyLeft(base_op=ops.add_op, left_scalar=prefix) expr = expr.project_unary_op(index_col, prefix_op) return Block( expr, @@ -1256,8 +1256,8 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: if axis_number == 0: expr = self._expr for index_col in self._index_columns: - expr = expr.project_unary_op(index_col, ops.AsTypeOp("string")) - prefix_op = ops.BinopPartialRight(ops.add_op, suffix) + expr = expr.project_unary_op(index_col, ops.AsTypeOp(to_type="string")) + prefix_op = ops.ApplyRight(base_op=ops.add_op, right_scalar=suffix) expr = expr.project_unary_op(index_col, prefix_op) return Block( expr, diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 461c2c005a..c1011a0fd5 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -26,6 +26,7 @@ import pandas import bigframes.constants as constants +import bigframes.core.compile.scalar_op_compiler as op_compilers import bigframes.core.guid from bigframes.core.ordering import ( encode_order_string, @@ -42,8 +43,11 @@ ORDER_ID_COLUMN = "bigframes_ordering_id" PREDICATE_COLUMN = "bigframes_predicate" + T = typing.TypeVar("T", bound="BaseIbisIR") +op_compiler = op_compilers.scalar_op_compiler + class BaseIbisIR(abc.ABC): """Implementation detail, contains common logic between ordered and unordered IR""" @@ -137,49 +141,20 @@ def _reproject_to_table(self: T) -> T: """ ... 
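+    # project_row_op below replaces the separate unary/binary/ternary projection
+    # methods: it takes any number of input column ids and delegates compilation of
+    # the row operation to the scalar op compiler. Hypothetical usage sketch:
+    #   ir = ir.project_row_op(("col_a", "col_b"), ops.add_op, output_column_id="col_sum")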
- def project_unary_op( + def project_row_op( self: T, - input_column_id: str, - op: ops.UnaryOp, + input_column_ids: typing.Sequence[str], + op: ops.RowOp, output_column_id: typing.Optional[str] = None, ) -> T: """Creates a new expression based on this expression with unary operation applied to one column.""" result_id = ( - output_column_id or input_column_id + output_column_id or input_column_ids[0] ) # overwrite input if not output id provided - value = op._as_ibis(self._get_ibis_column(input_column_id)).name(result_id) + inputs = tuple(self._get_ibis_column(col) for col in input_column_ids) + value = op_compiler.compile_row_op(op, inputs).name(result_id) return self._set_or_replace_by_id(result_id, value) - def project_binary_op( - self: T, - left_column_id: str, - right_column_id: str, - op: ops.BinaryOp, - output_column_id: str, - ) -> T: - """Creates a new expression based on this expression with binary operation applied to two columns.""" - value = op( - self._get_ibis_column(left_column_id), - self._get_ibis_column(right_column_id), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - - def project_ternary_op( - self: T, - col_id_1: str, - col_id_2: str, - col_id_3: str, - op: ops.TernaryOp, - output_column_id: str, - ) -> T: - """Creates a new expression based on this expression with ternary operation applied to three columns.""" - value = op( - self._get_ibis_column(col_id_1), - self._get_ibis_column(col_id_2), - self._get_ibis_column(col_id_3), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) - def assign(self: T, source_id: str, destination_id: str) -> T: return self._set_or_replace_by_id( destination_id, self._get_ibis_column(source_id) @@ -423,7 +398,9 @@ def unpivot( None, force_dtype=col_dtype ) ibis_values = [ - ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + op_compiler.compile_row_op( + ops.AsTypeOp(col_dtype), (unpivot_table[col],) + ) if col is not None else null_value for col in source_cols @@ -490,9 +467,7 @@ def aggregate( expr = OrderedIR(result, columns=columns, ordering=ordering) if dropna: for column_id in by_column_ids: - expr = expr._filter( - ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) - ) + expr = expr._filter(expr._get_ibis_column(column_id).notnull()) # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation return expr._project_offsets() else: @@ -942,7 +917,9 @@ def unpivot( None, force_dtype=col_dtype ) ibis_values = [ - ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + op_compiler.compile_row_op( + ops.AsTypeOp(col_dtype), (unpivot_table[col],) + ) if col is not None else null_value for col in source_cols diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 39892635f1..44809a7abd 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -143,23 +143,9 @@ def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): @_compile_node.register -def compile_project_unary(node: nodes.ProjectUnaryOpNode, ordered: bool = True): - return compile_node(node.child, ordered).project_unary_op( - node.input_id, node.op, node.output_id - ) - - -@_compile_node.register -def compile_project_binary(node: nodes.ProjectBinaryOpNode, ordered: bool = True): - return compile_node(node.child, ordered).project_binary_op( - node.left_input_id, node.right_input_id, node.op, node.output_id - ) - - -@_compile_node.register -def compile_project_ternary(node: 
nodes.ProjectTernaryOpNode, ordered: bool = True): - return compile_node(node.child, ordered).project_ternary_op( - node.input_id1, node.input_id2, node.input_id3, node.op, node.output_id +def compile_project(node: nodes.ProjectRowOpNode, ordered: bool = True): + return compile_node(node.child, ordered).project_row_op( + node.input_ids, node.op, node.output_id ) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py new file mode 100644 index 0000000000..669af0ac09 --- /dev/null +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -0,0 +1,1065 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import functools +import typing + +import ibis +import ibis.common.exceptions +import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.operations.generic +import ibis.expr.types as ibis_types +import numpy as np +import pandas as pd + +import bigframes.constants as constants +import bigframes.dtypes +import bigframes.dtypes as dtypes +import bigframes.operations as ops + +_ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) +_NAN = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.nan)) +_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.inf)) +_NEG_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(-np.inf)) + +# Approx Highest number you can pass in to EXP function and get a valid FLOAT64 result +# FLOAT64 has 11 exponent bits, so max values is about 2**(2**10) +# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. 
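+# e.g. EXP(709.78) ~= 1.79e308, just below the FLOAT64 max (~1.7977e308), while EXP(710) already overflows.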
+_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) + + +class ScalarOpCompiler: + # Mapping of operation name to implemenations + _registry: dict[ + str, + typing.Callable[ + [typing.Sequence[ibis_types.Value], ops.RowOp], ibis_types.Value + ], + ] = {} + + def compile_row_op( + self, op: ops.RowOp, inputs: typing.Sequence[ibis_types.Value] + ) -> ibis_types.Value: + impl = self._registry[op.name] + return impl(inputs, op) + + def register_unary_op( + self, + op_ref: typing.Union[ops.UnaryOp, type[ops.UnaryOp]], + pass_op: bool = False, + ): + key = typing.cast(str, op_ref.name) + + def decorator(impl: typing.Callable[..., ibis_types.Value]): + def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): + if pass_op: + return impl(args[0], op) + else: + return impl(args[0]) + + self._register(key, normalized_impl) + return impl + + return decorator + + def register_binary_op( + self, + op_ref: typing.Union[ops.BinaryOp, type[ops.BinaryOp]], + pass_op: bool = False, + ): + key = typing.cast(str, op_ref.name) + + def decorator(impl: typing.Callable[..., ibis_types.Value]): + def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): + if pass_op: + return impl(args[0], args[1], op) + else: + return impl(args[0], args[1]) + + self._register(key, normalized_impl) + return impl + + return decorator + + def register_ternary_op( + self, op_ref: typing.Union[ops.TernaryOp, type[ops.TernaryOp]] + ): + key = typing.cast(str, op_ref.name) + + def decorator(impl: typing.Callable[..., ibis_types.Value]): + def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): + return impl(args[0], args[1], args[2]) + + self._register(key, normalized_impl) + return impl + + return decorator + + def _register( + self, + op_name: str, + impl: typing.Callable[ + [typing.Sequence[ibis_types.Value], ops.RowOp], ibis_types.Value + ], + ): + if op_name in self._registry: + raise ValueError(f"Operation name {op_name} already registered") + self._registry[op_name] = impl + + +# Singleton compiler +scalar_op_compiler = ScalarOpCompiler() + + +### Unary Ops +@scalar_op_compiler.register_unary_op(ops.isnull_op) +def isnull_op_impl(x: ibis_types.Value): + return x.isnull() + + +@scalar_op_compiler.register_unary_op(ops.notnull_op) +def notnull_op_impl(x: ibis_types.Value): + return x.notnull() + + +@scalar_op_compiler.register_unary_op(ops.hash_op) +def hash_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.IntegerValue, x).hash() + + +# Trig Functions +@scalar_op_compiler.register_unary_op(ops.sin_op) +def sin_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).sin() + + +@scalar_op_compiler.register_unary_op(ops.cos_op) +def cos_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).cos() + + +@scalar_op_compiler.register_unary_op(ops.tan_op) +def tan_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).tan() + + +# Inverse trig functions +@scalar_op_compiler.register_unary_op(ops.arcsin_op) +def arcsin_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.asin()) + + +@scalar_op_compiler.register_unary_op(ops.arccos_op) +def arccos_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.acos()) + + 
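+# Each register_* decorator above stores its implementation in the registry keyed by the
+# op's name, so compile_row_op is a plain dictionary dispatch. For example (with a
+# hypothetical input column), scalar_op_compiler.compile_row_op(ops.arcsin_op, (col,))
+# looks up the key "arcsin" and calls arcsin_op_impl(col).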
+@scalar_op_compiler.register_unary_op(ops.arctan_op) +def arctan_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).atan() + + +# Hyperbolic trig functions +# BQ has these functions, but Ibis doesn't +@scalar_op_compiler.register_unary_op(ops.sinh_op) +def sinh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sinh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF * numeric_value.sign(), sinh_result) + + +@scalar_op_compiler.register_unary_op(ops.cosh_op) +def cosh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + cosh_result = (numeric_value.exp() + (numeric_value.negate()).exp()) / _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, cosh_result) + + +@scalar_op_compiler.register_unary_op(ops.tanh_op) +def tanh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + tanh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / ( + numeric_value.exp() + (numeric_value.negate()).exp() + ) + # Beyond +-20, is effectively just the sign function + domain = numeric_value.abs() < _ibis_num(20) + return (~domain).ifelse(numeric_value.sign(), tanh_result) + + +@scalar_op_compiler.register_unary_op(ops.arcsinh_op) +def arcsinh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) + _ibis_num(1)).sqrt() + return (numeric_value.abs() + sqrt_part).ln() * numeric_value.sign() + + +@scalar_op_compiler.register_unary_op(ops.arccosh_op) +def arccosh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) - _ibis_num(1)).sqrt() + acosh_result = (numeric_value + sqrt_part).ln() + domain = numeric_value >= _ibis_num(1) + return (~domain).ifelse(_NAN, acosh_result) + + +@scalar_op_compiler.register_unary_op(ops.arctanh_op) +def arctanh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() < _ibis_num(1) + numerator = numeric_value + _ibis_num(1) + denominator = _ibis_num(1) - numeric_value + ln_input = typing.cast(ibis_types.NumericValue, numerator.div(denominator)) + atanh_result = ln_input.ln().div(2) + + out_of_domain = (numeric_value.abs() == _ibis_num(1)).ifelse( + _INF * numeric_value, _NAN + ) + + return (~domain).ifelse(out_of_domain, atanh_result) + + +# Numeric Ops +@scalar_op_compiler.register_unary_op(ops.abs_op) +def abs_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).abs() + + +@scalar_op_compiler.register_unary_op(ops.sqrt_op) +def sqrt_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value >= _ZERO + return (~domain).ifelse(_NAN, numeric_value.sqrt()) + + +@scalar_op_compiler.register_unary_op(ops.log10_op) +def log10_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.log10()) + + +@scalar_op_compiler.register_unary_op(ops.ln_op) +def ln_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == 
_ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.ln()) + + +@scalar_op_compiler.register_unary_op(ops.exp_op) +def exp_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, numeric_value.exp()) + + +@scalar_op_compiler.register_unary_op(ops.invert_op) +def invert_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).negate() + + +## String Operation +@scalar_op_compiler.register_unary_op(ops.len_op) +def len_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).length().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.reverse_op) +def reverse_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).reverse() + + +@scalar_op_compiler.register_unary_op(ops.lower_op) +def lower_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).lower() + + +@scalar_op_compiler.register_unary_op(ops.upper_op) +def upper_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).upper() + + +@scalar_op_compiler.register_unary_op(ops.strip_op) +def strip_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).strip() + + +@scalar_op_compiler.register_unary_op(ops.isnumeric_op) +def isnumeric_op_impl(x: ibis_types.Value): + # catches all members of the Unicode number class, which matches pandas isnumeric + # see https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#regexp_contains + # TODO: Validate correctness, my miss eg ⅕ character + return typing.cast(ibis_types.StringValue, x).re_search(r"^(\pN+)$") + + +@scalar_op_compiler.register_unary_op(ops.isalpha_op) +def isalpha_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_search( + r"^(\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" + ) + + +@scalar_op_compiler.register_unary_op(ops.isdigit_op) +def isdigit_op_impl(x: ibis_types.Value): + # Based on docs, should include superscript/subscript-ed numbers + # Tests however pass only when set to Nd unicode class + return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") + + +@scalar_op_compiler.register_unary_op(ops.isdecimal_op) +def isdecimal_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") + + +@scalar_op_compiler.register_unary_op(ops.isalnum_op) +def isalnum_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_search( + r"^(\p{N}|\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" + ) + + +@scalar_op_compiler.register_unary_op(ops.isspace_op) +def isspace_op_impl(x: ibis_types.Value): + # All characters are whitespace characters, False for empty string + return typing.cast(ibis_types.StringValue, x).re_search(r"^\s+$") + + +@scalar_op_compiler.register_unary_op(ops.islower_op) +def islower_op_impl(x: ibis_types.Value): + # No upper case characters, min one cased character + # See: https://docs.python.org/3/library/stdtypes.html#str + return typing.cast(ibis_types.StringValue, x).re_search(r"\p{Ll}") & ~typing.cast( + ibis_types.StringValue, x + ).re_search(r"\p{Lu}|\p{Lt}") + + +@scalar_op_compiler.register_unary_op(ops.isupper_op) +def isupper_op_impl(x: ibis_types.Value): + # No lower case characters, min one cased character + # See: https://docs.python.org/3/library/stdtypes.html#str + return typing.cast(ibis_types.StringValue, x).re_search(r"\p{Lu}") & ~typing.cast( + 
ibis_types.StringValue, x + ).re_search(r"\p{Ll}|\p{Lt}") + + +@scalar_op_compiler.register_unary_op(ops.rstrip_op) +def rstrip_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).rstrip() + + +@scalar_op_compiler.register_unary_op(ops.lstrip_op) +def lstrip_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).lstrip() + + +@scalar_op_compiler.register_unary_op(ops.capitalize_op) +def capitalize_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).capitalize() + + +@scalar_op_compiler.register_unary_op(ops.StrContainsOp, pass_op=True) +def strcontains_op(x: ibis_types.Value, op: ops.StrContainsOp): + return typing.cast(ibis_types.StringValue, x).contains(op.pat) + + +@scalar_op_compiler.register_unary_op(ops.StrContainsRegexOp, pass_op=True) +def contains_regex_op_impl(x: ibis_types.Value, op: ops.StrContainsRegexOp): + return typing.cast(ibis_types.StringValue, x).re_search(op.pat) + + +@scalar_op_compiler.register_unary_op(ops.StrGetOp, pass_op=True) +def strget_op_impl(x: ibis_types.Value, op: ops.StrGetOp): + substr = typing.cast( + ibis_types.StringValue, typing.cast(ibis_types.StringValue, x)[op.i] + ) + return substr.nullif(ibis_types.literal("")) + + +@scalar_op_compiler.register_unary_op(ops.StrPadOp, pass_op=True) +def strpad_op_impl(x: ibis_types.Value, op: ops.StrPadOp): + str_val = typing.cast(ibis_types.StringValue, x) + + # SQL pad operations will truncate, we do not want to truncate though. + pad_length = ibis.greatest(str_val.length(), op.length) + if op.side == "left": + return str_val.lpad(pad_length, op.fillchar) + elif op.side == "right": + return str_val.rpad(pad_length, op.fillchar) + else: # side == both + # Pad more on right side if can't pad both sides equally + lpad_amount = ((pad_length - str_val.length()) // 2) + str_val.length() + return str_val.lpad(lpad_amount, op.fillchar).rpad(pad_length, op.fillchar) + + +@scalar_op_compiler.register_unary_op(ops.ReplaceStrOp, pass_op=True) +def replacestring_op_impl(x: ibis_types.Value, op: ops.ReplaceStrOp): + pat_str_value = typing.cast(ibis_types.StringValue, ibis_types.literal(op.pat)) + repl_str_value = typing.cast(ibis_types.StringValue, ibis_types.literal(op.repl)) + return typing.cast(ibis_types.StringValue, x).replace(pat_str_value, repl_str_value) + + +@scalar_op_compiler.register_unary_op(ops.RegexReplaceStrOp, pass_op=True) +def replaceregex_op_impl(x: ibis_types.Value, op: ops.RegexReplaceStrOp): + return typing.cast(ibis_types.StringValue, x).re_replace(op.pat, op.repl) + + +@scalar_op_compiler.register_unary_op(ops.StartsWithOp, pass_op=True) +def startswith_op_impl(x: ibis_types.Value, op: ops.StartsWithOp): + any_match = None + for pat in op.pat: + pat_match = typing.cast(ibis_types.StringValue, x).startswith(pat) + if any_match is not None: + any_match = any_match | pat_match + else: + any_match = pat_match + return any_match if any_match is not None else ibis_types.literal(False) + + +@scalar_op_compiler.register_unary_op(ops.EndsWithOp, pass_op=True) +def endswith_op_impl(x: ibis_types.Value, op: ops.EndsWithOp): + any_match = None + for pat in op.pat: + pat_match = typing.cast(ibis_types.StringValue, x).endswith(pat) + if any_match is not None: + any_match = any_match | pat_match + else: + any_match = pat_match + return any_match if any_match is not None else ibis_types.literal(False) + + +@scalar_op_compiler.register_unary_op(ops.ZfillOp, pass_op=True) +def zfill_op_impl(x: ibis_types.Value, op: ops.ZfillOp): + str_value = 
typing.cast(ibis_types.StringValue, x) + return ( + ibis.case() + .when( + str_value[0] == "-", + "-" + + strpad_op_impl( + str_value.substr(1), + ops.StrPadOp(length=op.width - 1, fillchar="0", side="left"), + ), + ) + .else_( + strpad_op_impl( + str_value, ops.StrPadOp(length=op.width, fillchar="0", side="left") + ) + ) + .end() + ) + + +@scalar_op_compiler.register_unary_op(ops.StrFindOp, pass_op=True) +def find_op_impl(x: ibis_types.Value, op: ops.StrFindOp): + return typing.cast(ibis_types.StringValue, x).find(op.substr, op.start, op.end) + + +@scalar_op_compiler.register_unary_op(ops.StrExtractOp, pass_op=True) +def extract_op_impl(x: ibis_types.Value, op: ops.StrExtractOp): + return typing.cast(ibis_types.StringValue, x).re_extract(op.pat, op.n) + + +@scalar_op_compiler.register_unary_op(ops.StrSliceOp, pass_op=True) +def slice_op_impl(x: ibis_types.Value, op: ops.StrSliceOp): + return typing.cast(ibis_types.StringValue, x)[op.start : op.end] + + +@scalar_op_compiler.register_unary_op(ops.StrRepeatOp, pass_op=True) +def repeat_op_impl(x: ibis_types.Value, op: ops.StrRepeatOp): + return typing.cast(ibis_types.StringValue, x).repeat(op.repeats) + + +## Datetime Ops +@scalar_op_compiler.register_unary_op(ops.day_op) +def day_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).day().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.date_op) +def date_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).date() + + +@scalar_op_compiler.register_unary_op(ops.dayofweek_op) +def dayofweek_op_impl(x: ibis_types.Value): + return ( + typing.cast(ibis_types.TimestampValue, x) + .day_of_week.index() + .cast(ibis_dtypes.int64) + ) + + +@scalar_op_compiler.register_unary_op(ops.hour_op) +def hour_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).hour().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.minute_op) +def minute_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).minute().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.month_op) +def month_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).month().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.quarter_op) +def quarter_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).quarter().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.second_op) +def second_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).second().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.time_op) +def time_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).time() + + +@scalar_op_compiler.register_unary_op(ops.year_op) +def year_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).year().cast(ibis_dtypes.int64) + + +# Parameterized ops +@scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True) +def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): + struct_value = typing.cast(ibis_types.StructValue, x) + if isinstance(op.name_or_index, str): + name = op.name_or_index + else: + name = struct_value.names[op.name_or_index] + return struct_value[name].name(name) + + +@scalar_op_compiler.register_unary_op(ops.AsTypeOp, pass_op=True) +def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): + to_type = 
bigframes.dtypes.bigframes_dtype_to_ibis_dtype(op.to_type) + if isinstance(x, ibis_types.NullScalar): + return ibis_types.null().cast(to_type) + return bigframes.dtypes.cast_ibis_value(x, to_type) + + +@scalar_op_compiler.register_unary_op(ops.IsInOp, pass_op=True) +def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): + contains_nulls = any(is_null(value) for value in op.values) + matchable_ibis_values = [] + for item in op.values: + if not is_null(item): + try: + # we want values that *could* be cast to the dtype, but we don't want + # to actually cast it, as that could be lossy (eg float -> int) + item_inferred_type = ibis.literal(item).type() + if ( + x.type() == item_inferred_type + or x.type().is_numeric() + and item_inferred_type.is_numeric() + ): + matchable_ibis_values.append(item) + except TypeError: + pass + + if op.match_nulls and contains_nulls: + return x.isnull() | x.isin(matchable_ibis_values) + else: + return x.isin(matchable_ibis_values) + + +@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) +def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): + if not hasattr(op.func, "bigframes_remote_function"): + raise TypeError( + f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" + ) + x_transformed = op.func(x) + if not op.apply_on_null: + x_transformed = ibis.case().when(x.isnull(), x).else_(x_transformed).end() + return x_transformed + + +### Binary Ops +def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): + """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar.""" + + def short_circuit_nulls_inner(binop): + @functools.wraps(binop) + def wrapped_binop(x: ibis_types.Value, y: ibis_types.Value): + if isinstance(x, ibis_types.NullScalar): + return ibis_types.null().cast(type_override or y.type()) + elif isinstance(y, ibis_types.NullScalar): + return ibis_types.null().cast(type_override or x.type()) + else: + return binop(x, y) + + return wrapped_binop + + return short_circuit_nulls_inner + + +@scalar_op_compiler.register_binary_op(ops.strconcat_op) +def concat_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + x_string = typing.cast(ibis_types.StringValue, x) + y_string = typing.cast(ibis_types.StringValue, y) + return x_string.concat(y_string) + + +@scalar_op_compiler.register_binary_op(ops.eq_op) +def eq_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return x == y + + +@scalar_op_compiler.register_binary_op(ops.eq_null_match_op) +def eq_nulls_match_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + """Variant of eq_op where nulls match each other. 
Only use where dtypes are known to be same.""" + left = x.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$")) + right = y.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$")) + return left == right + + +@scalar_op_compiler.register_binary_op(ops.ne_op) +def ne_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return x != y + + +@scalar_op_compiler.register_binary_op(ops.and_op) +def and_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return typing.cast(ibis_types.BooleanValue, x) & typing.cast( + ibis_types.BooleanValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.or_op) +def or_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return typing.cast(ibis_types.BooleanValue, x) | typing.cast( + ibis_types.BooleanValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.add_op) +@short_circuit_nulls() +def add_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): + return + return typing.cast(ibis_types.NumericValue, x) + typing.cast( + ibis_types.NumericValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.sub_op) +@short_circuit_nulls() +def sub_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return typing.cast(ibis_types.NumericValue, x) - typing.cast( + ibis_types.NumericValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.mul_op) +@short_circuit_nulls() +def mul_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return typing.cast(ibis_types.NumericValue, x) * typing.cast( + ibis_types.NumericValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.div_op) +@short_circuit_nulls(ibis_dtypes.float) +def div_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return typing.cast(ibis_types.NumericValue, x) / typing.cast( + ibis_types.NumericValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.pow_op) +@short_circuit_nulls(ibis_dtypes.float) +def pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + if x.type().is_integer() and y.type().is_integer(): + return _int_pow_op(x, y) + else: + return _float_pow_op(x, y) + + +@scalar_op_compiler.register_binary_op(ops.unsafe_pow_op) +@short_circuit_nulls(ibis_dtypes.float) +def unsafe_pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + """For internal use only - where domain and overflow checks are not needed.""" + return typing.cast(ibis_types.NumericValue, x) ** typing.cast( + ibis_types.NumericValue, y + ) + + +def _int_pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Need to avoid any error cases - should produce NaN instead + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow + x_as_decimal = typing.cast( + ibis_types.NumericValue, + x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)), + ) + y_val = typing.cast(ibis_types.NumericValue, y) + + # BQ POW() function outputs FLOAT64, which can lose precision. + # Therefore, we do math in NUMERIC and cast back down after. + # Also, explicit bounds checks, pandas will silently overflow. + pow_result = x_as_decimal**y_val + overflow_cond = (pow_result > _ibis_num((2**63) - 1)) | ( + pow_result < _ibis_num(-(2**63)) + ) + + return ( + ibis.case() + .when((overflow_cond), ibis.null()) + .else_(pow_result.cast(ibis_dtypes.int64)) + .end() + ) + + +def _float_pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Most conditions here seek to prevent calling BQ POW with inputs that would generate errors. 
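+    # In particular, POW errors on a zero base with a negative exponent and on finite
+    # overflow rather than returning inf, so those cases are special-cased below.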
+ # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow + x_val = typing.cast(ibis_types.NumericValue, x) + y_val = typing.cast(ibis_types.NumericValue, y) + + overflow_cond = (x_val != _ZERO) & ((y_val * x_val.abs().ln()) > _FLOAT64_EXP_BOUND) + + # Float64 lose integer precision beyond 2**53, beyond this insufficient precision to get parity + exp_too_big = y_val.abs() > _ibis_num(2**53) + # Treat very large exponents as +=INF + norm_exp = exp_too_big.ifelse(_INF * y_val.sign(), y_val) + + pow_result = x_val**norm_exp + + # This cast is dangerous, need to only excuted where y_val has been bounds-checked + # Ibis needs try_cast binding to bq safe_cast + exponent_is_whole = y_val.cast(ibis_dtypes.int64) == y_val + odd_exponent = (x_val < _ZERO) & ( + y_val.cast(ibis_dtypes.int64) % _ibis_num(2) == _ibis_num(1) + ) + infinite_base = x_val.abs() == _INF + + return ( + ibis.case() + # Might be able to do something more clever with x_val==0 case + .when(y_val == _ZERO, _ibis_num(1)) + .when( + x_val == _ibis_num(1), _ibis_num(1) + ) # Need to ignore exponent, even if it is NA + .when( + (x_val == _ZERO) & (y_val < _ZERO), _INF + ) # This case would error POW function in BQ + .when(infinite_base, pow_result) + .when( + exp_too_big, pow_result + ) # Bigquery can actually handle the +-inf cases gracefully + .when((x_val < _ZERO) & (~exponent_is_whole), _NAN) + .when( + overflow_cond, _INF * odd_exponent.ifelse(_ibis_num(-1), _ibis_num(1)) + ) # finite overflows would cause bq to error + .else_(pow_result) + .end() + ) + + +@scalar_op_compiler.register_binary_op(ops.lt_op) +@short_circuit_nulls(ibis_dtypes.bool) +def lt_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return x < y + + +@scalar_op_compiler.register_binary_op(ops.le_op) +@short_circuit_nulls(ibis_dtypes.bool) +def le_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return x <= y + + +@scalar_op_compiler.register_binary_op(ops.gt_op) +@short_circuit_nulls(ibis_dtypes.bool) +def gt_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return x > y + + +@scalar_op_compiler.register_binary_op(ops.ge_op) +@short_circuit_nulls(ibis_dtypes.bool) +def ge_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return x >= y + + +@scalar_op_compiler.register_binary_op(ops.floordiv_op) +@short_circuit_nulls(ibis_dtypes.int) +def floordiv_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + x_numeric = typing.cast(ibis_types.NumericValue, x) + y_numeric = typing.cast(ibis_types.NumericValue, y) + floordiv_expr = x_numeric // y_numeric + + # DIV(N, 0) will error in bigquery, but needs to return 0 for int, and inf for float in BQ so we short-circuit in this case. + # Multiplying left by zero propogates nulls. 
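+    # zero_result * x_numeric keeps the numerator's sign for floats (e.g. -7.0 // 0 -> -inf),
+    # is 0 for ints, and stays NULL for NULL numerators.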
+ zero_result = _INF if (x.type().is_floating() or y.type().is_floating()) else _ZERO + return ( + ibis.case() + .when(y_numeric == _ZERO, zero_result * x_numeric) + .else_(floordiv_expr) + .end() + ) + + +def _is_float(x: ibis_types.Value): + return isinstance(x, (ibis_types.FloatingColumn, ibis_types.FloatingScalar)) + + +@scalar_op_compiler.register_binary_op(ops.mod_op) +@short_circuit_nulls() +def mod_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + is_result_float = _is_float(x) | _is_float(y) + x_numeric = typing.cast( + ibis_types.NumericValue, + x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)) + if is_result_float + else x, + ) + y_numeric = typing.cast( + ibis_types.NumericValue, + y.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)) + if is_result_float + else y, + ) + # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null. + op = y.op() + if isinstance(op, ibis.expr.operations.generic.Literal) and op.value == 0: + return ibis_types.null().cast(x.type()) + + bq_mod = x_numeric % y_numeric # Bigquery will maintain x sign here + if is_result_float: + bq_mod = typing.cast(ibis_types.NumericValue, bq_mod.cast(ibis_dtypes.float64)) + + # In BigQuery returned value has the same sign as X. In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y) + return ( + ibis.case() + .when( + y_numeric == _ZERO, + _NAN * x_numeric if is_result_float else _ZERO * x_numeric, + ) # Dummy op to propogate nulls and type from x arg + .when( + (y_numeric < _ZERO) & (bq_mod > _ZERO), (y_numeric + bq_mod) + ) # Convert positive result to negative + .when( + (y_numeric > _ZERO) & (bq_mod < _ZERO), (y_numeric + bq_mod) + ) # Convert negative result to positive + .else_(bq_mod) + .end() + ) + + +@scalar_op_compiler.register_binary_op(ops.fillna_op) +def fillna_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return x.fillna(typing.cast(ibis_types.Scalar, y)) + + +@scalar_op_compiler.register_binary_op(ops.round_op) +def round_op(x: ibis_types.Value, y: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).round( + digits=typing.cast(ibis_types.IntegerValue, y) + ) + + +@scalar_op_compiler.register_binary_op(ops.coalesce_op) +def coalesce_impl( + x: ibis_types.Value, + y: ibis_types.Value, +): + if x.name("name").equals(y.name("name")): + return x + else: + return ibis.coalesce(x, y) + + +@scalar_op_compiler.register_binary_op(ops.cliplower_op) +def clip_lower( + value: ibis_types.Value, + lower: ibis_types.Value, +): + return ibis.case().when(lower.isnull() | (value < lower), lower).else_(value).end() + + +@scalar_op_compiler.register_binary_op(ops.clipupper_op) +def clip_upper( + value: ibis_types.Value, + upper: ibis_types.Value, +): + return ibis.case().when(upper.isnull() | (value > upper), upper).else_(value).end() + + +# Ternary Operations +@scalar_op_compiler.register_ternary_op(ops.where_op) +def where_op( + original: ibis_types.Value, + condition: ibis_types.Value, + replacement: ibis_types.Value, +) -> ibis_types.Value: + """Returns x if y is true, otherwise returns z.""" + return ibis.case().when(condition, original).else_(replacement).end() + + +@scalar_op_compiler.register_ternary_op(ops.clip_op) +def clip_op( + original: ibis_types.Value, + lower: ibis_types.Value, + upper: ibis_types.Value, +) -> ibis_types.Value: + """Clips value to lower and upper bounds.""" + if isinstance(lower, ibis_types.NullScalar) and ( + not isinstance(upper, ibis_types.NullScalar) + ): + 
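+        # Only an upper bound was provided (lower is a NULL scalar); this branch reduces
+        # to the same expression as clip_upper above.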
return ( + ibis.case() + .when(upper.isnull() | (original > upper), upper) + .else_(original) + .end() + ) + elif (not isinstance(lower, ibis_types.NullScalar)) and isinstance( + upper, ibis_types.NullScalar + ): + return ( + ibis.case() + .when(lower.isnull() | (original < lower), lower) + .else_(original) + .end() + ) + elif isinstance(lower, ibis_types.NullScalar) and ( + isinstance(upper, ibis_types.NullScalar) + ): + return original + else: + # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. This implementation requires that lower_bound < upper_bound + return ( + ibis.case() + .when(lower.isnull() | (original < lower), lower) + .when(upper.isnull() | (original > upper), upper) + .else_(original) + .end() + ) + + +# Composition Ops +@scalar_op_compiler.register_unary_op(ops.ApplyRight, pass_op=True) +def apply_right(input: ibis_types.Value, op: ops.ApplyRight): + right = dtypes.literal_to_ibis_scalar(op.right_scalar, validate=False) + return scalar_op_compiler.compile_row_op(op.base_op, (input, right)) + + +@scalar_op_compiler.register_unary_op(ops.ApplyLeft, pass_op=True) +def apply_left(input: ibis_types.Value, op: ops.ApplyLeft): + left = dtypes.literal_to_ibis_scalar(op.left_scalar, validate=False) + return scalar_op_compiler.compile_row_op(op.base_op, (left, input)) + + +@scalar_op_compiler.register_binary_op(ops.ReverseArgsOp, pass_op=True) +def apply_reversed( + input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ReverseArgsOp +): + return scalar_op_compiler.compile_row_op(op.base_op, (input2, input1)) + + +@scalar_op_compiler.register_binary_op(ops.ApplyArg1, pass_op=True) +def apply_arg1(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg1): + arg1 = dtypes.literal_to_ibis_scalar(op.scalar, validate=False) + return scalar_op_compiler.compile_row_op(op.base_op, (arg1, input1, input2)) + + +@scalar_op_compiler.register_binary_op(ops.ApplyArg3, pass_op=True) +def apply_arg3(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg3): + arg3 = dtypes.literal_to_ibis_scalar(op.scalar, validate=False) + return scalar_op_compiler.compile_row_op(op.base_op, (input1, input2, arg3)) + + +# Helpers +def is_null(value) -> bool: + # float NaN/inf should be treated as distinct from 'true' null values + return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) + + +def _ibis_num(number: float): + return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index fc7cf167d4..4163ca909c 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -186,7 +186,7 @@ def astype( ) -> Index: if self.nlevels > 1: raise TypeError("Multiindex does not support 'astype'") - return self._apply_unary_op(ops.AsTypeOp(dtype)) + return self._apply_unary_op(ops.AsTypeOp(to_type=dtype)) def all(self) -> bool: if self.nlevels > 1: @@ -278,7 +278,7 @@ def drop( level_id = self._block.index_columns[0] if utils.is_list_like(labels): block, inverse_condition_id = block.apply_unary_op( - level_id, ops.IsInOp(labels, match_nulls=True) + level_id, ops.IsInOp(values=tuple(labels), match_nulls=True) ) block, condition_id = block.apply_unary_op( inverse_condition_id, ops.invert_op @@ -308,9 +308,9 @@ def isin(self, values) -> Index: f"isin(), you passed a [{type(values).__name__}]" ) - return self._apply_unary_op(ops.IsInOp(values, match_nulls=True)).fillna( - value=False - ) + return self._apply_unary_op( + 
ops.IsInOp(values=tuple(values), match_nulls=True) + ).fillna(value=False) def _apply_unary_op( self, diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 44a8d808ff..1014acdbea 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -159,29 +159,12 @@ class SelectNode(UnaryNode): @dataclass(frozen=True) -class ProjectUnaryOpNode(UnaryNode): - input_id: str - op: ops.UnaryOp +class ProjectRowOpNode(UnaryNode): + input_ids: typing.Tuple[str, ...] + op: ops.RowOp output_id: Optional[str] = None -@dataclass(frozen=True) -class ProjectBinaryOpNode(UnaryNode): - left_input_id: str - right_input_id: str - op: ops.BinaryOp - output_id: str - - -@dataclass(frozen=True) -class ProjectTernaryOpNode(UnaryNode): - input_id1: str - input_id2: str - input_id3: str - op: ops.TernaryOp - output_id: str - - @dataclass(frozen=True) class AggregateNode(UnaryNode): aggregations: typing.Tuple[typing.Tuple[str, agg_ops.AggregateOp, str], ...] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3b0fd7008a..67dd7aacee 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -153,7 +153,7 @@ def __init__( block = block.select_columns(list(columns)) # type:ignore if dtype: block = block.multi_apply_unary_op( - block.value_columns, ops.AsTypeOp(dtype) + block.value_columns, ops.AsTypeOp(to_type=dtype) ) self._block = block @@ -315,7 +315,7 @@ def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], ) -> DataFrame: - return self._apply_unary_op(ops.AsTypeOp(dtype)) + return self._apply_unary_op(ops.AsTypeOp(to_type=dtype)) def _to_sql_query( self, include_index: bool @@ -637,7 +637,7 @@ def _apply_binop( def _apply_scalar_binop(self, other: float | int, op: ops.BinaryOp) -> DataFrame: block = self._block - partial_op = ops.BinopPartialRight(op, other) + partial_op = ops.ApplyRight(base_op=op, right_scalar=other) for column_id, label in zip( self._block.value_columns, self._block.column_labels ): @@ -1062,7 +1062,7 @@ def drop( if utils.is_list_like(index): block, inverse_condition_id = block.apply_unary_op( - level_id, ops.IsInOp(index, match_nulls=True) + level_id, ops.IsInOp(values=tuple(index), match_nulls=True) ) block, condition_id = block.apply_unary_op( inverse_condition_id, ops.invert_op @@ -1422,16 +1422,16 @@ def _filter_rows( block = self._block block, label_string_id = block.apply_unary_op( self._block.index_columns[0], - ops.AsTypeOp(pandas.StringDtype(storage="pyarrow")), + ops.AsTypeOp(to_type=pandas.StringDtype(storage="pyarrow")), ) if like is not None: block, mask_id = block.apply_unary_op( - label_string_id, ops.ContainsStringOp(pat=like) + label_string_id, ops.StrContainsOp(pat=like) ) else: # regex assert regex is not None block, mask_id = block.apply_unary_op( - label_string_id, ops.ContainsRegexOp(pat=regex) + label_string_id, ops.StrContainsRegexOp(pat=regex) ) block = block.filter(mask_id) @@ -1441,7 +1441,7 @@ def _filter_rows( # Behavior matches pandas 2.1+, older pandas versions would reindex block = self._block block, mask_id = block.apply_unary_op( - self._block.index_columns[0], ops.IsInOp(values=list(items)) + self._block.index_columns[0], ops.IsInOp(values=tuple(items)) ) block = block.filter(mask_id) block = block.select_columns(self._block.value_columns) @@ -1577,7 +1577,9 @@ def isin(self, values) -> DataFrame: if label in values.keys(): value_for_key = values[label] block, result_id = block.apply_unary_op( - col, ops.IsInOp(value_for_key, match_nulls=True), label + col, + 
ops.IsInOp(values=tuple(value_for_key), match_nulls=True), + label, ) result_ids.append(result_id) else: @@ -1587,9 +1589,9 @@ def isin(self, values) -> DataFrame: result_ids.append(result_id) return DataFrame(block.select_columns(result_ids)).fillna(value=False) elif utils.is_list_like(values): - return self._apply_unary_op(ops.IsInOp(values, match_nulls=True)).fillna( - value=False - ) + return self._apply_unary_op( + ops.IsInOp(values=tuple(values), match_nulls=True) + ).fillna(value=False) else: raise TypeError( "only list-like objects are allowed to be passed to " @@ -2739,7 +2741,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: # inputs causing errors. reprojected_df = DataFrame(self._block._force_reproject()) return reprojected_df._apply_unary_op( - ops.RemoteFunctionOp(func, apply_on_null=(na_action is None)) + ops.RemoteFunctionOp(func=func, apply_on_null=(na_action is None)) ) def apply(self, func, *, args: typing.Tuple = (), **kwargs): diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index a29dd36c72..a1c7569c96 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -14,1023 +14,373 @@ from __future__ import annotations -import functools +import dataclasses import typing -import ibis -import ibis.common.exceptions -import ibis.expr.datatypes as ibis_dtypes -import ibis.expr.operations.generic -import ibis.expr.types as ibis_types import numpy as np -import pandas as pd -import bigframes.constants as constants -import bigframes.dtypes import bigframes.dtypes as dtypes -_ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) -_NAN = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.nan)) -_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.inf)) -_NEG_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(-np.inf)) -# Approx Highest number you can pass in to EXP function and get a valid FLOAT64 result -# FLOAT64 has 11 exponent bits, so max values is about 2**(2**10) -# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. -_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) -_INT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(43.6)) +class RowOp(typing.Protocol): + @property + def name(self) -> str: + raise NotImplementedError("RowOp abstract base class has no implementation") -BinaryOp = typing.Callable[[ibis_types.Value, ibis_types.Value], ibis_types.Value] -TernaryOp = typing.Callable[ - [ibis_types.Value, ibis_types.Value, ibis_types.Value], ibis_types.Value -] + @property + def arguments(self) -> int: + """The number of column argument the operation takes""" + raise NotImplementedError("RowOp abstract base class has no implementation") -### Unary Ops +# These classes can be used to create simple ops that don't take local parameters +# All is needed is a unique name, and to register an implementation in ibis_mappings.py +@dataclasses.dataclass(frozen=True) class UnaryOp: - def _as_ibis(self, x): - raise NotImplementedError( - f"Base class UnaryOp has no implementation. 
{constants.FEEDBACK_LINK}" - ) - @property - def is_windowed(self): - return False - - -# Trig Functions -class AbsOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).abs() - - -class SinOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).sin() - - -class CosOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).cos() - - -class TanOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).tan() - - -# Inverse trig functions -class ArcsinOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value.abs() <= _ibis_num(1) - return (~domain).ifelse(_NAN, numeric_value.asin()) - - -class ArccosOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value.abs() <= _ibis_num(1) - return (~domain).ifelse(_NAN, numeric_value.acos()) - - -class ArctanOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).atan() - - -# Hyperbolic trig functions -# BQ has these functions, but Ibis doesn't -class SinhOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - sinh_result = ( - numeric_value.exp() - (numeric_value.negate()).exp() - ) / _ibis_num(2) - domain = numeric_value.abs() < _FLOAT64_EXP_BOUND - return (~domain).ifelse(_INF * numeric_value.sign(), sinh_result) - - -class CoshOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - cosh_result = ( - numeric_value.exp() + (numeric_value.negate()).exp() - ) / _ibis_num(2) - domain = numeric_value.abs() < _FLOAT64_EXP_BOUND - return (~domain).ifelse(_INF, cosh_result) - - -class TanhOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - tanh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / ( - numeric_value.exp() + (numeric_value.negate()).exp() - ) - # Beyond +-20, is effectively just the sign function - domain = numeric_value.abs() < _ibis_num(20) - return (~domain).ifelse(numeric_value.sign(), tanh_result) - - -class ArcsinhOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - sqrt_part = ((numeric_value * numeric_value) + _ibis_num(1)).sqrt() - return (numeric_value.abs() + sqrt_part).ln() * numeric_value.sign() - - -class ArccoshOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - sqrt_part = ((numeric_value * numeric_value) - _ibis_num(1)).sqrt() - acosh_result = (numeric_value + sqrt_part).ln() - domain = numeric_value >= _ibis_num(1) - return (~domain).ifelse(_NAN, acosh_result) - - -class ArctanhOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value.abs() < _ibis_num(1) - numerator = numeric_value + _ibis_num(1) - denominator = _ibis_num(1) - numeric_value - ln_input = typing.cast(ibis_types.NumericValue, numerator.div(denominator)) - atanh_result = ln_input.ln().div(2) - - out_of_domain = (numeric_value.abs() == _ibis_num(1)).ifelse( - _INF * numeric_value, _NAN - ) - - return (~domain).ifelse(out_of_domain, atanh_result) - - -class 
SqrtOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value >= _ZERO - return (~domain).ifelse(_NAN, numeric_value.sqrt()) - - -class Log10Op(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value > _ZERO - out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) - return (~domain).ifelse(out_of_domain, numeric_value.log10()) - - -class LnOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value > _ZERO - out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) - return (~domain).ifelse(out_of_domain, numeric_value.ln()) - - -class ExpOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value < _FLOAT64_EXP_BOUND - return (~domain).ifelse(_INF, numeric_value.exp()) - - -class InvertOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).negate() - - -class IsNullOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return x.isnull() - - -class LenOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).length().cast(ibis_dtypes.int64) - - -class NotNullOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return x.notnull() - - -class HashOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.IntegerValue, x).hash() - - -## String Operation -class ReverseOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).reverse() - - -class LowerOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).lower() - - -class UpperOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).upper() - - -class StripOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).strip() - - -class IsNumericOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - # catches all members of the Unicode number class, which matches pandas isnumeric - # see https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#regexp_contains - # TODO: Validate correctness, my miss eg ⅕ character - return typing.cast(ibis_types.StringValue, x).re_search(r"^(\pN+)$") - - -class IsAlphaOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).re_search( - r"^(\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" - ) - - -class IsDigitOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - # Based on docs, should include superscript/subscript-ed numbers - # Tests however pass only when set to Nd unicode class - return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") - - -class IsDecimalOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") - - -class IsAlnumOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).re_search( - r"^(\p{N}|\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" - ) - - -class IsSpaceOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - # All characters are whitespace characters, False for empty string - return typing.cast(ibis_types.StringValue, x).re_search(r"^\s+$") - - -class 
IsLowerOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - # No upper case characters, min one cased character - # See: https://docs.python.org/3/library/stdtypes.html#str - return typing.cast(ibis_types.StringValue, x).re_search( - r"\p{Ll}" - ) & ~typing.cast(ibis_types.StringValue, x).re_search(r"\p{Lu}|\p{Lt}") - - -class IsUpperOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - # No lower case characters, min one cased character - # See: https://docs.python.org/3/library/stdtypes.html#str - return typing.cast(ibis_types.StringValue, x).re_search( - r"\p{Lu}" - ) & ~typing.cast(ibis_types.StringValue, x).re_search(r"\p{Ll}|\p{Lt}") - - -class RstripOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).rstrip() - - -class LstripOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).lstrip() + def name(self) -> str: + raise NotImplementedError("RowOp abstract base class has no implementation") + @property + def arguments(self) -> int: + return 1 -class CapitalizeOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).capitalize() +@dataclasses.dataclass(frozen=True) +class BinaryOp: + @property + def name(self) -> str: + raise NotImplementedError("RowOp abstract base class has no implementation") -class ContainsStringOp(UnaryOp): - def __init__(self, pat: str, case: bool = True): - self._pat = pat + @property + def arguments(self) -> int: + return 2 - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).contains(self._pat) +@dataclasses.dataclass(frozen=True) +class TernaryOp: + @property + def name(self) -> str: + raise NotImplementedError("RowOp abstract base class has no implementation") -class ContainsRegexOp(UnaryOp): - def __init__(self, pat: str): - self._pat = pat + @property + def arguments(self) -> int: + return 3 + + +# Operation Factories +def create_unary_op(name: str) -> UnaryOp: + return dataclasses.make_dataclass( + name, + [("name", typing.ClassVar[str], name)], # type: ignore + bases=(UnaryOp,), + frozen=True, + )() + + +def create_binary_op(name: str) -> BinaryOp: + return dataclasses.make_dataclass( + name, + [("name", typing.ClassVar[str], name)], # type: ignore + bases=(BinaryOp,), + frozen=True, + )() + + +def create_ternary_op(name: str) -> TernaryOp: + return dataclasses.make_dataclass( + name, + [("name", typing.ClassVar[str], name)], # type: ignore + bases=(TernaryOp,), + frozen=True, + )() + + +# Unary Ops +## Generic Ops +invert_op = create_unary_op(name="invert") +isnull_op = create_unary_op(name="isnull") +notnull_op = create_unary_op(name="notnull") +hash_op = create_unary_op(name="hash") +## String Ops +len_op = create_unary_op(name="len") +reverse_op = create_unary_op(name="reverse") +lower_op = create_unary_op(name="lower") +upper_op = create_unary_op(name="upper") +strip_op = create_unary_op(name="strip") +isalnum_op = create_unary_op(name="isalnum") +isalpha_op = create_unary_op(name="isalpha") +isdecimal_op = create_unary_op(name="isdecimal") +isdigit_op = create_unary_op(name="isdigit") +isnumeric_op = create_unary_op(name="isnumeric") +isspace_op = create_unary_op(name="isspace") +islower_op = create_unary_op(name="islower") +isupper_op = create_unary_op(name="isupper") +rstrip_op = create_unary_op(name="rstrip") +lstrip_op = create_unary_op(name="lstrip") +capitalize_op = create_unary_op(name="capitalize") +## DateTime Ops +day_op = 
create_unary_op(name="day") +dayofweek_op = create_unary_op(name="dayofweek") +date_op = create_unary_op(name="date") +hour_op = create_unary_op(name="hour") +minute_op = create_unary_op(name="minute") +month_op = create_unary_op(name="month") +quarter_op = create_unary_op(name="quarter") +second_op = create_unary_op(name="second") +time_op = create_unary_op(name="time") +year_op = create_unary_op(name="year") +## Trigonometry Ops +sin_op = create_unary_op(name="sin") +cos_op = create_unary_op(name="cos") +tan_op = create_unary_op(name="tan") +arcsin_op = create_unary_op(name="arcsin") +arccos_op = create_unary_op(name="arccos") +arctan_op = create_unary_op(name="arctan") +sinh_op = create_unary_op(name="sinh") +cosh_op = create_unary_op(name="cosh") +tanh_op = create_unary_op(name="tanh") +arcsinh_op = create_unary_op(name="arcsinh") +arccosh_op = create_unary_op(name="arccosh") +arctanh_op = create_unary_op(name="arctanh") +## Numeric Ops +abs_op = create_unary_op(name="abs") +exp_op = create_unary_op(name="exp") +ln_op = create_unary_op(name="log") +log10_op = create_unary_op(name="log10") +sqrt_op = create_unary_op(name="sqrt") + + +# Parameterized unary ops +@dataclasses.dataclass(frozen=True) +class StrContainsOp(UnaryOp): + name: typing.ClassVar[str] = "str_contains" + pat: str + + +@dataclasses.dataclass(frozen=True) +class StrContainsRegexOp(UnaryOp): + name: typing.ClassVar[str] = "str_contains_regex" + pat: str + + +@dataclasses.dataclass(frozen=True) +class StrGetOp(UnaryOp): + name: typing.ClassVar[str] = "str_get" + i: int - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).re_search(self._pat) +@dataclasses.dataclass(frozen=True) +class StrPadOp(UnaryOp): + name: typing.ClassVar[str] = "str_pad" + length: int + fillchar: str + side: typing.Literal["both", "left", "right"] -class StrGetOp(UnaryOp): - def __init__(self, i: int): - self._i = i - def _as_ibis(self, x: ibis_types.Value): - substr = typing.cast( - ibis_types.StringValue, typing.cast(ibis_types.StringValue, x)[self._i] - ) - return substr.nullif(ibis_types.literal("")) +@dataclasses.dataclass(frozen=True) +class ReplaceStrOp(UnaryOp): + name: typing.ClassVar[str] = "str_replace" + pat: str + repl: str -class StrPadOp(UnaryOp): - def __init__( - self, length: int, fillchar: str, side: typing.Literal["both", "left", "right"] - ): - self._length = length - self._fillchar = fillchar - self._side = side - - def _as_ibis(self, x: ibis_types.Value): - str_val = typing.cast(ibis_types.StringValue, x) - - # SQL pad operations will truncate, we do not want to truncate though. 
- pad_length = ibis.greatest(str_val.length(), self._length) - if self._side == "left": - return str_val.lpad(pad_length, self._fillchar) - elif self._side == "right": - return str_val.rpad(pad_length, self._fillchar) - else: # side == both - # Pad more on right side if can't pad both sides equally - lpad_amount = ((pad_length - str_val.length()) // 2) + str_val.length() - return str_val.lpad(lpad_amount, self._fillchar).rpad( - pad_length, self._fillchar - ) - - -class ReplaceStringOp(UnaryOp): - def __init__(self, pat: str, repl: str): - self._pat = pat - self._repl = repl - - def _as_ibis(self, x: ibis_types.Value): - pat_str_value = typing.cast( - ibis_types.StringValue, ibis_types.literal(self._pat) - ) - repl_str_value = typing.cast( - ibis_types.StringValue, ibis_types.literal(self._pat) - ) - - return typing.cast(ibis_types.StringValue, x).replace( - pat_str_value, repl_str_value - ) - - -class ReplaceRegexOp(UnaryOp): - def __init__(self, pat: str, repl: str): - self._pat = pat - self._repl = repl - - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).re_replace(self._pat, self._repl) +@dataclasses.dataclass(frozen=True) +class RegexReplaceStrOp(UnaryOp): + name: typing.ClassVar[str] = "str_rereplace" + pat: str + repl: str +@dataclasses.dataclass(frozen=True) class StartsWithOp(UnaryOp): - def __init__(self, pat: typing.Sequence[str]): - self._pat = pat - - def _as_ibis(self, x: ibis_types.Value): - any_match = None - for pat in self._pat: - pat_match = typing.cast(ibis_types.StringValue, x).startswith(pat) - if any_match is not None: - any_match = any_match | pat_match - else: - any_match = pat_match - return any_match if any_match is not None else ibis_types.literal(False) + name: typing.ClassVar[str] = "str_startswith" + pat: typing.Sequence[str] +@dataclasses.dataclass(frozen=True) class EndsWithOp(UnaryOp): - def __init__(self, pat: typing.Sequence[str]): - self._pat = pat - - def _as_ibis(self, x: ibis_types.Value): - any_match = None - for pat in self._pat: - pat_match = typing.cast(ibis_types.StringValue, x).endswith(pat) - if any_match is not None: - any_match = any_match | pat_match - else: - any_match = pat_match - return any_match if any_match is not None else ibis_types.literal(False) + name: typing.ClassVar[str] = "str_endswith" + pat: typing.Sequence[str] +@dataclasses.dataclass(frozen=True) class ZfillOp(UnaryOp): - def __init__(self, width: int): - self._width = width - - def _as_ibis(self, x: ibis_types.Value): - str_value = typing.cast(ibis_types.StringValue, x) - return ( - ibis.case() - .when( - str_value[0] == "-", - "-" - + StrPadOp(self._width - 1, "0", "left")._as_ibis(str_value.substr(1)), - ) - .else_(StrPadOp(self._width, "0", "left")._as_ibis(str_value)) - .end() - ) - - -## Datetime Ops -class DayOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).day().cast(ibis_dtypes.int64) - - -class DateOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).date() + name: typing.ClassVar[str] = "str_zfill" + width: int -class DayofweekOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return ( - typing.cast(ibis_types.TimestampValue, x) - .day_of_week.index() - .cast(ibis_dtypes.int64) - ) +@dataclasses.dataclass(frozen=True) +class StrFindOp(UnaryOp): + name: typing.ClassVar[str] = "str_find" + substr: str + start: typing.Optional[int] + end: typing.Optional[int] -class HourOp(UnaryOp): - def _as_ibis(self, x: 
ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).hour().cast(ibis_dtypes.int64) +@dataclasses.dataclass(frozen=True) +class StrExtractOp(UnaryOp): + name: typing.ClassVar[str] = "str_extract" + pat: str + n: int = 1 -class MinuteOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return ( - typing.cast(ibis_types.TimestampValue, x).minute().cast(ibis_dtypes.int64) - ) +@dataclasses.dataclass(frozen=True) +class StrSliceOp(UnaryOp): + name: typing.ClassVar[str] = "str_slice" + start: typing.Optional[int] + end: typing.Optional[int] -class MonthOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).month().cast(ibis_dtypes.int64) +@dataclasses.dataclass(frozen=True) +class StrRepeatOp(UnaryOp): + name: typing.ClassVar[str] = "str_repeat" + repeats: int -class QuarterOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return ( - typing.cast(ibis_types.TimestampValue, x).quarter().cast(ibis_dtypes.int64) - ) +# Other parameterized unary operations +@dataclasses.dataclass(frozen=True) +class StructFieldOp(UnaryOp): + name: typing.ClassVar[str] = "struct_field" + name_or_index: str | int -class SecondOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return ( - typing.cast(ibis_types.TimestampValue, x).second().cast(ibis_dtypes.int64) - ) - - -class TimeOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).time() - - -class YearOp(UnaryOp): - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).year().cast(ibis_dtypes.int64) - - -# Parameterized ops +@dataclasses.dataclass(frozen=True) class AsTypeOp(UnaryOp): - def __init__(self, to_type: dtypes.DtypeString | dtypes.Dtype): - self.to_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(to_type) + name: typing.ClassVar[str] = "astype" + to_type: dtypes.DtypeString | dtypes.Dtype - def _as_ibis(self, x: ibis_types.Value): - if isinstance(x, ibis_types.NullScalar): - return ibis_types.null().cast(self.to_type) - - return bigframes.dtypes.cast_ibis_value(x, self.to_type) +@dataclasses.dataclass(frozen=True) +class IsInOp(UnaryOp): + name: typing.ClassVar[str] = "is_in" + values: typing.Tuple + match_nulls: bool = True -class FindOp(UnaryOp): - def __init__(self, sub, start, end): - self._sub = sub - self._start = start - self._end = end - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).find( - self._sub, self._start, self._end - ) +@dataclasses.dataclass(frozen=True) +class RemoteFunctionOp(UnaryOp): + name: typing.ClassVar[str] = "remote_function" + func: typing.Callable + apply_on_null: bool -class ExtractOp(UnaryOp): - def __init__(self, pat: str, n: int = 1): - self._pat = pat - self._n = n +# Operation Composition +# Meta-ops that do partial application or parameter remapping +# Subject to change, may convert to explicit tree +@dataclasses.dataclass(frozen=True) +class ApplyRight(UnaryOp): + name: typing.ClassVar[str] = "apply_right" + base_op: BinaryOp + right_scalar: typing.Any - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).re_extract(self._pat, self._n) +@dataclasses.dataclass(frozen=True) +class ApplyLeft(UnaryOp): + name: typing.ClassVar[str] = "apply_left" + base_op: BinaryOp + left_scalar: typing.Any -class SliceOp(UnaryOp): - def __init__(self, start, stop): - self._start = start - self._stop = stop - def _as_ibis(self, x: ibis_types.Value): - return 
typing.cast(ibis_types.StringValue, x)[self._start : self._stop] +@dataclasses.dataclass(frozen=True) +class ApplyArg1(BinaryOp): + name: typing.ClassVar[str] = "apply_arg1" + base_op: TernaryOp + scalar: typing.Any -class IsInOp(UnaryOp): - def __init__(self, values, match_nulls: bool = True): - self._values = values - self._match_nulls = match_nulls +@dataclasses.dataclass(frozen=True) +class ApplyArg3(BinaryOp): + name: typing.ClassVar[str] = "apply_arg3" + base_op: TernaryOp + scalar: typing.Any - def _as_ibis(self, x: ibis_types.Value): - contains_nulls = any(is_null(value) for value in self._values) - matchable_ibis_values = [] - for item in self._values: - if not is_null(item): - try: - # we want values that *could* be cast to the dtype, but we don't want - # to actually cast it, as that could be lossy (eg float -> int) - item_inferred_type = ibis.literal(item).type() - if ( - x.type() == item_inferred_type - or x.type().is_numeric() - and item_inferred_type.is_numeric() - ): - matchable_ibis_values.append(item) - except TypeError: - pass - if self._match_nulls and contains_nulls: - return x.isnull() | x.isin(matchable_ibis_values) - else: - return x.isin(matchable_ibis_values) +@dataclasses.dataclass(frozen=True) +class ReverseArgsOp(BinaryOp): + name: typing.ClassVar[str] = "apply_reverse" + base_op: BinaryOp -class BinopPartialRight(UnaryOp): - def __init__(self, binop: BinaryOp, right_scalar: typing.Any): - self._binop = binop - self._right = dtypes.literal_to_ibis_scalar(right_scalar, validate=False) +def partial_left(op: BinaryOp, scalar: typing.Any) -> UnaryOp: + return ApplyLeft(base_op=op, left_scalar=scalar) - def _as_ibis(self, x): - return self._binop(x, self._right) +def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp: + return ApplyRight(base_op=op, right_scalar=scalar) -class BinopPartialLeft(UnaryOp): - def __init__(self, binop: BinaryOp, left_scalar: typing.Any): - self._binop = binop - self._left = dtypes.literal_to_ibis_scalar(left_scalar, validate=False) - def _as_ibis(self, x): - return self._binop(self._left, x) +def partial_arg1(op: TernaryOp, scalar: typing.Any) -> BinaryOp: + return ApplyArg1(base_op=op, scalar=scalar) -class RepeatOp(UnaryOp): - def __init__(self, repeats): - self._repeats = repeats +def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp: + return ApplyArg3(base_op=op, scalar=scalar) - def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).repeat(self._repeats) +def reverse(op: BinaryOp) -> BinaryOp: + return ReverseArgsOp(base_op=op) + + +# Binary Ops +fillna_op = create_binary_op(name="fillna") +cliplower_op = create_binary_op(name="clip_lower") +clipupper_op = create_binary_op(name="clip_upper") +coalesce_op = create_binary_op(name="coalesce") +## Math Ops +add_op = create_binary_op(name="add") +sub_op = create_binary_op(name="sub") +mul_op = create_binary_op(name="mul") +div_op = create_binary_op(name="div") +floordiv_op = create_binary_op(name="floordiv") +pow_op = create_binary_op(name="pow") +mod_op = create_binary_op(name="mod") +round_op = create_binary_op(name="round") +unsafe_pow_op = create_binary_op(name="unsafe_pow_op") +# Logical Ops +and_op = create_binary_op(name="and") +or_op = create_binary_op(name="or") + +## Comparison Ops +eq_op = create_binary_op(name="eq") +eq_null_match_op = create_binary_op(name="eq_nulls_match") +ne_op = create_binary_op(name="ne") +lt_op = create_binary_op(name="lt") +gt_op = create_binary_op(name="gt") +le_op = create_binary_op(name="le") 
+ge_op = create_binary_op(name="ge") + +## String Ops +strconcat_op = create_binary_op(name="strconcat") + +# Ternary Ops +where_op = create_ternary_op(name="where") +clip_op = create_ternary_op(name="clip") -class RemoteFunctionOp(UnaryOp): - def __init__(self, func: typing.Callable, apply_on_null=True): - if not hasattr(func, "bigframes_remote_function"): - raise TypeError( - f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}" - ) - - self._func = func - self._apply_on_null = apply_on_null - - def _as_ibis(self, x: ibis_types.Value): - x_transformed = self._func(x) - if not self._apply_on_null: - x_transformed = where_op(x, x.isnull(), x_transformed) - return x_transformed - - -abs_op = AbsOp() -invert_op = InvertOp() -isnull_op = IsNullOp() -len_op = LenOp() -notnull_op = NotNullOp() -reverse_op = ReverseOp() -lower_op = LowerOp() -upper_op = UpperOp() -strip_op = StripOp() -isalnum_op = IsAlnumOp() -isalpha_op = IsAlphaOp() -isdecimal_op = IsDecimalOp() -isdigit_op = IsDigitOp() -isnumeric_op = IsNumericOp() -isspace_op = IsSpaceOp() -islower_op = IsLowerOp() -isupper_op = IsUpperOp() -rstrip_op = RstripOp() -lstrip_op = LstripOp() -hash_op = HashOp() -day_op = DayOp() -dayofweek_op = DayofweekOp() -date_op = DateOp() -hour_op = HourOp() -minute_op = MinuteOp() -month_op = MonthOp() -quarter_op = QuarterOp() -second_op = SecondOp() -time_op = TimeOp() -year_op = YearOp() -capitalize_op = CapitalizeOp() # Just parameterless unary ops for now # TODO: Parameter mappings NUMPY_TO_OP: typing.Final = { - np.sin: SinOp(), - np.cos: CosOp(), - np.tan: TanOp(), - np.arcsin: ArcsinOp(), - np.arccos: ArccosOp(), - np.arctan: ArctanOp(), - np.sinh: SinhOp(), - np.cosh: CoshOp(), - np.tanh: TanhOp(), - np.arcsinh: ArcsinhOp(), - np.arccosh: ArccoshOp(), - np.arctanh: ArctanhOp(), - np.exp: ExpOp(), - np.log: LnOp(), - np.log10: Log10Op(), - np.sqrt: SqrtOp(), - np.abs: AbsOp(), + np.sin: sin_op, + np.cos: cos_op, + np.tan: tan_op, + np.arcsin: arcsin_op, + np.arccos: arccos_op, + np.arctan: arctan_op, + np.sinh: sinh_op, + np.cosh: cosh_op, + np.tanh: tanh_op, + np.arcsinh: arcsinh_op, + np.arccosh: arccosh_op, + np.arctanh: arctanh_op, + np.exp: exp_op, + np.log: ln_op, + np.log10: log10_op, + np.sqrt: sqrt_op, + np.abs: abs_op, } -### Binary Ops -def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): - """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar.""" - - def short_circuit_nulls_inner(binop): - @functools.wraps(binop) - def wrapped_binop(x: ibis_types.Value, y: ibis_types.Value): - if isinstance(x, ibis_types.NullScalar): - return ibis_types.null().cast(type_override or y.type()) - elif isinstance(y, ibis_types.NullScalar): - return ibis_types.null().cast(type_override or x.type()) - else: - return binop(x, y) - - return wrapped_binop - - return short_circuit_nulls_inner - - -def concat_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x_string = typing.cast(ibis_types.StringValue, x) - y_string = typing.cast(ibis_types.StringValue, y) - return x_string.concat(y_string) - - -def eq_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return x == y - - -def eq_nulls_match_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - """Variant of eq_op where nulls match each other. 
Only use where dtypes are known to be same.""" - left = x.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$")) - right = y.cast(ibis_dtypes.str).fillna(ibis_types.literal("$NULL_SENTINEL$")) - return left == right - - -def ne_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return x != y - - -def and_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.BooleanValue, x) & typing.cast( - ibis_types.BooleanValue, y - ) - - -def or_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.BooleanValue, x) | typing.cast( - ibis_types.BooleanValue, y - ) - - -@short_circuit_nulls() -def add_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): - return - return typing.cast(ibis_types.NumericValue, x) + typing.cast( - ibis_types.NumericValue, y - ) - - -@short_circuit_nulls() -def sub_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.NumericValue, x) - typing.cast( - ibis_types.NumericValue, y - ) - - -@short_circuit_nulls() -def mul_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.NumericValue, x) * typing.cast( - ibis_types.NumericValue, y - ) - - -@short_circuit_nulls(ibis_dtypes.float) -def div_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.NumericValue, x) / typing.cast( - ibis_types.NumericValue, y - ) - - -@short_circuit_nulls(ibis_dtypes.float) -def pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - if x.type().is_integer() and y.type().is_integer(): - return _int_pow_op(x, y) - else: - return _float_pow_op(x, y) - - -@short_circuit_nulls(ibis_dtypes.float) -def unsafe_pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - """For internal use only - where domain and overflow checks are not needed.""" - return typing.cast(ibis_types.NumericValue, x) ** typing.cast( - ibis_types.NumericValue, y - ) - - -def _int_pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - # Need to avoid any error cases - should produce NaN instead - # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow - x_as_decimal = typing.cast( - ibis_types.NumericValue, - x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)), - ) - y_val = typing.cast(ibis_types.NumericValue, y) - - # BQ POW() function outputs FLOAT64, which can lose precision. - # Therefore, we do math in NUMERIC and cast back down after. - # Also, explicit bounds checks, pandas will silently overflow. - pow_result = x_as_decimal**y_val - overflow_cond = (pow_result > _ibis_num((2**63) - 1)) | ( - pow_result < _ibis_num(-(2**63)) - ) - - return ( - ibis.case() - .when((overflow_cond), ibis.null()) - .else_(pow_result.cast(ibis_dtypes.int64)) - .end() - ) - - -def _float_pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - # Most conditions here seek to prevent calling BQ POW with inputs that would generate errors. 
- # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow - x_val = typing.cast(ibis_types.NumericValue, x) - y_val = typing.cast(ibis_types.NumericValue, y) - - overflow_cond = (x_val != _ZERO) & ((y_val * x_val.abs().ln()) > _FLOAT64_EXP_BOUND) - - # Float64 lose integer precision beyond 2**53, beyond this insufficient precision to get parity - exp_too_big = y_val.abs() > _ibis_num(2**53) - # Treat very large exponents as +=INF - norm_exp = exp_too_big.ifelse(_INF * y_val.sign(), y_val) - - pow_result = x_val**norm_exp - - # This cast is dangerous, need to only excuted where y_val has been bounds-checked - # Ibis needs try_cast binding to bq safe_cast - exponent_is_whole = y_val.cast(ibis_dtypes.int64) == y_val - odd_exponent = (x_val < _ZERO) & ( - y_val.cast(ibis_dtypes.int64) % _ibis_num(2) == _ibis_num(1) - ) - infinite_base = x_val.abs() == _INF - - return ( - ibis.case() - # Might be able to do something more clever with x_val==0 case - .when(y_val == _ZERO, _ibis_num(1)) - .when( - x_val == _ibis_num(1), _ibis_num(1) - ) # Need to ignore exponent, even if it is NA - .when( - (x_val == _ZERO) & (y_val < _ZERO), _INF - ) # This case would error POW function in BQ - .when(infinite_base, pow_result) - .when( - exp_too_big, pow_result - ) # Bigquery can actually handle the +-inf cases gracefully - .when((x_val < _ZERO) & (~exponent_is_whole), _NAN) - .when( - overflow_cond, _INF * odd_exponent.ifelse(_ibis_num(-1), _ibis_num(1)) - ) # finite overflows would cause bq to error - .else_(pow_result) - .end() - ) - - -@short_circuit_nulls(ibis_dtypes.bool) -def lt_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return x < y - - -@short_circuit_nulls(ibis_dtypes.bool) -def le_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return x <= y - - -@short_circuit_nulls(ibis_dtypes.bool) -def gt_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return x > y - - -@short_circuit_nulls(ibis_dtypes.bool) -def ge_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return x >= y - - -def coalesce_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - if x.name("name").equals(y.name("name")): - return x - else: - return ibis.coalesce(x, y) - - -@short_circuit_nulls(ibis_dtypes.int) -def floordiv_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x_numeric = typing.cast(ibis_types.NumericValue, x) - y_numeric = typing.cast(ibis_types.NumericValue, y) - floordiv_expr = x_numeric // y_numeric - - # DIV(N, 0) will error in bigquery, but needs to return 0 for int, and inf for float in BQ so we short-circuit in this case. - # Multiplying left by zero propogates nulls. - zero_result = _INF if (x.type().is_floating() or y.type().is_floating()) else _ZERO - return ( - ibis.case() - .when(y_numeric == _ZERO, zero_result * x_numeric) - .else_(floordiv_expr) - .end() - ) - - -def _is_float(x: ibis_types.Value): - return isinstance(x, (ibis_types.FloatingColumn, ibis_types.FloatingScalar)) - - -@short_circuit_nulls() -def mod_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - is_result_float = _is_float(x) | _is_float(y) - x_numeric = typing.cast( - ibis_types.NumericValue, - x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)) - if is_result_float - else x, - ) - y_numeric = typing.cast( - ibis_types.NumericValue, - y.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)) - if is_result_float - else y, - ) - # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null. 
- op = y.op() - if isinstance(op, ibis.expr.operations.generic.Literal) and op.value == 0: - return ibis_types.null().cast(x.type()) - - bq_mod = x_numeric % y_numeric # Bigquery will maintain x sign here - if is_result_float: - bq_mod = typing.cast(ibis_types.NumericValue, bq_mod.cast(ibis_dtypes.float64)) - - # In BigQuery returned value has the same sign as X. In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y) - return ( - ibis.case() - .when( - y_numeric == _ZERO, - _NAN * x_numeric if is_result_float else _ZERO * x_numeric, - ) # Dummy op to propogate nulls and type from x arg - .when( - (y_numeric < _ZERO) & (bq_mod > _ZERO), (y_numeric + bq_mod) - ) # Convert positive result to negative - .when( - (y_numeric > _ZERO) & (bq_mod < _ZERO), (y_numeric + bq_mod) - ) # Convert negative result to positive - .else_(bq_mod) - .end() - ) - - -def fillna_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return x.fillna(typing.cast(ibis_types.Scalar, y)) - - -def round_op(x: ibis_types.Value, y: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).round( - digits=typing.cast(ibis_types.IntegerValue, y) - ) - - -def clip_lower( - value: ibis_types.Value, - lower: ibis_types.Value, -): - return ibis.case().when(lower.isnull() | (value < lower), lower).else_(value).end() - - -def clip_upper( - value: ibis_types.Value, - upper: ibis_types.Value, -): - return ibis.case().when(upper.isnull() | (value > upper), upper).else_(value).end() - - -def reverse(op: BinaryOp) -> BinaryOp: - return lambda x, y: op(y, x) - - -def partial_left(op: BinaryOp, scalar: typing.Any) -> UnaryOp: - return BinopPartialLeft(op, scalar) - - -def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp: - return BinopPartialRight(op, scalar) - - NUMPY_TO_BINOP: typing.Final = { np.add: add_op, np.subtract: sub_op, @@ -1038,73 +388,3 @@ def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp: np.divide: div_op, np.power: pow_op, } - - -# Ternary ops -def where_op( - original: ibis_types.Value, - condition: ibis_types.Value, - replacement: ibis_types.Value, -) -> ibis_types.Value: - """Returns x if y is true, otherwise returns z.""" - return ibis.case().when(condition, original).else_(replacement).end() - - -def clip_op( - original: ibis_types.Value, - lower: ibis_types.Value, - upper: ibis_types.Value, -) -> ibis_types.Value: - """Clips value to lower and upper bounds.""" - if isinstance(lower, ibis_types.NullScalar) and ( - not isinstance(upper, ibis_types.NullScalar) - ): - return ( - ibis.case() - .when(upper.isnull() | (original > upper), upper) - .else_(original) - .end() - ) - elif (not isinstance(lower, ibis_types.NullScalar)) and isinstance( - upper, ibis_types.NullScalar - ): - return ( - ibis.case() - .when(lower.isnull() | (original < lower), lower) - .else_(original) - .end() - ) - elif isinstance(lower, ibis_types.NullScalar) and ( - isinstance(upper, ibis_types.NullScalar) - ): - return original - else: - # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. 
This implementation requires that lower_bound < upper_bound - return ( - ibis.case() - .when(lower.isnull() | (original < lower), lower) - .when(upper.isnull() | (original > upper), upper) - .else_(original) - .end() - ) - - -def partial_arg1(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return lambda x, y: op(dtypes.literal_to_ibis_scalar(scalar, validate=False), x, y) - - -def partial_arg2(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return lambda x, y: op(x, dtypes.literal_to_ibis_scalar(scalar, validate=False), y) - - -def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return lambda x, y: op(x, y, dtypes.literal_to_ibis_scalar(scalar, validate=False)) - - -def is_null(value) -> bool: - # float NaN/inf should be treated as distinct from 'true' null values - return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) - - -def _ibis_num(number: float): - return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 85ce1dd9e6..8989255f7e 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -71,7 +71,7 @@ def __init__( ) if dtype: block = block.multi_apply_unary_op( - block.value_columns, ops.AsTypeOp(dtype) + block.value_columns, ops.AsTypeOp(to_type=dtype) ) self._block = block @@ -162,7 +162,7 @@ def _apply_binary_op( block.select_column(result_id).assign_label(result_id, name) ) else: - partial_op = ops.BinopPartialRight(op, other) + partial_op = ops.ApplyRight(base_op=op, right_scalar=other) return self._apply_unary_op(partial_op) def _apply_corr_aggregation(self, other: series.Series) -> float: diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 201b19abe8..2798f18b38 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -43,7 +43,7 @@ def find( start: Optional[int] = None, end: Optional[int] = None, ) -> series.Series: - return self._apply_unary_op(ops.FindOp(sub, start, end)) + return self._apply_unary_op(ops.StrFindOp(substr=sub, start=start, end=end)) def len(self) -> series.Series: return self._apply_unary_op(ops.len_op) @@ -61,7 +61,7 @@ def slice( start: Optional[int] = None, stop: Optional[int] = None, ) -> series.Series: - return self._apply_unary_op(ops.SliceOp(start, stop)) + return self._apply_unary_op(ops.StrSliceOp(start=start, end=stop)) def strip(self) -> series.Series: return self._apply_unary_op(ops.strip_op) @@ -114,7 +114,7 @@ def lstrip(self) -> series.Series: return self._apply_unary_op(ops.lstrip_op) def repeat(self, repeats: int) -> series.Series: - return self._apply_unary_op(ops.RepeatOp(repeats)) + return self._apply_unary_op(ops.StrRepeatOp(repeats=repeats)) def capitalize(self) -> series.Series: return self._apply_unary_op(ops.capitalize_op) @@ -122,38 +122,44 @@ def capitalize(self) -> series.Series: def match(self, pat, case=True, flags=0) -> series.Series: # \A anchors start of entire string rather than start of any line in multiline mode adj_pat = rf"\A{pat}" - return self.contains(adj_pat, case=case, flags=flags) + return self.contains(pat=adj_pat, case=case, flags=flags) def fullmatch(self, pat, case=True, flags=0) -> series.Series: # \A anchors start of entire string rather than start of any line in multiline mode # \z likewise anchors to the end of the entire multiline string adj_pat = rf"\A{pat}\z" - return self.contains(adj_pat, case=case, flags=flags) + return self.contains(pat=adj_pat, case=case, flags=flags) def get(self, i: 
int) -> series.Series: - return self._apply_unary_op(ops.StrGetOp(i)) + return self._apply_unary_op(ops.StrGetOp(i=i)) def pad(self, width, side="left", fillchar=" ") -> series.Series: - return self._apply_unary_op(ops.StrPadOp(width, fillchar, side)) + return self._apply_unary_op( + ops.StrPadOp(length=width, fillchar=fillchar, side=side) + ) def ljust(self, width, fillchar=" ") -> series.Series: - return self._apply_unary_op(ops.StrPadOp(width, fillchar, "right")) + return self._apply_unary_op( + ops.StrPadOp(length=width, fillchar=fillchar, side="right") + ) def rjust(self, width, fillchar=" ") -> series.Series: - return self._apply_unary_op(ops.StrPadOp(width, fillchar, "left")) + return self._apply_unary_op( + ops.StrPadOp(length=width, fillchar=fillchar, side="left") + ) def contains( self, pat, case: bool = True, flags: int = 0, *, regex: bool = True ) -> series.Series: if not case: - return self.contains(pat, flags=flags | re.IGNORECASE, regex=True) + return self.contains(pat=pat, flags=flags | re.IGNORECASE, regex=True) if regex: re2flags = _parse_flags(flags) if re2flags: pat = re2flags + pat - return self._apply_unary_op(ops.ContainsRegexOp(pat)) + return self._apply_unary_op(ops.StrContainsRegexOp(pat=pat)) else: - return self._apply_unary_op(ops.ContainsStringOp(pat)) + return self._apply_unary_op(ops.StrContainsOp(pat=pat)) def extract(self, pat: str, flags: int = 0) -> df.DataFrame: re2flags = _parse_flags(flags) @@ -173,7 +179,9 @@ def extract(self, pat: str, flags: int = 0) -> df.DataFrame: ] label = labels[0] if labels else str(i) block, id = block.apply_unary_op( - self._value_column, ops.ExtractOp(pat, i + 1), result_label=label + self._value_column, + ops.StrExtractOp(pat=pat, n=i + 1), + result_label=label, ) results.append(id) block = block.select_columns(results) @@ -196,13 +204,13 @@ def replace( re2flags = _parse_flags(flags) if re2flags: patstr = re2flags + patstr - return self._apply_unary_op(ops.ReplaceRegexOp(patstr, repl)) + return self._apply_unary_op(ops.RegexReplaceStrOp(pat=patstr, repl=repl)) else: if is_compiled: raise ValueError( "Must set 'regex'=True if using compiled regex pattern." 
) - return self._apply_unary_op(ops.ReplaceStringOp(patstr, repl)) + return self._apply_unary_op(ops.ReplaceStrOp(pat=patstr, repl=repl)) def startswith( self, @@ -210,7 +218,7 @@ def startswith( ) -> series.Series: if not isinstance(pat, tuple): pat = (pat,) - return self._apply_unary_op(ops.StartsWithOp(pat)) + return self._apply_unary_op(ops.StartsWithOp(pat=pat)) def endswith( self, @@ -218,13 +226,15 @@ def endswith( ) -> series.Series: if not isinstance(pat, tuple): pat = (pat,) - return self._apply_unary_op(ops.EndsWithOp(pat)) + return self._apply_unary_op(ops.EndsWithOp(pat=pat)) def zfill(self, width: int) -> series.Series: - return self._apply_unary_op(ops.ZfillOp(width)) + return self._apply_unary_op(ops.ZfillOp(width=width)) def center(self, width: int, fillchar: str = " ") -> series.Series: - return self._apply_unary_op(ops.StrPadOp(width, fillchar, "both")) + return self._apply_unary_op( + ops.StrPadOp(length=width, fillchar=fillchar, side="both") + ) def cat( self, @@ -232,7 +242,7 @@ def cat( *, join: Literal["outer", "left"] = "left", ) -> series.Series: - return self._apply_binary_op(others, ops.concat_op, alignment=join) + return self._apply_binary_op(others, ops.strconcat_op, alignment=join) def _parse_flags(flags: int) -> Optional[str]: diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index b2ae98f378..0e00b781c9 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -14,10 +14,6 @@ from __future__ import annotations -import typing - -import ibis.expr.types as ibis_types - from bigframes.core import log_adapter import bigframes.dataframe import bigframes.operations @@ -26,19 +22,6 @@ import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors -class _StructField(bigframes.operations.UnaryOp): - def __init__(self, name_or_index: str | int): - self._name_or_index = name_or_index - - def _as_ibis(self, x: ibis_types.Value): - struct_value = typing.cast(ibis_types.StructValue, x) - if isinstance(self._name_or_index, str): - name = self._name_or_index - else: - name = struct_value.names[self._name_or_index] - return struct_value[name].name(name) - - @log_adapter.class_logger class StructAccessor( bigframes.operations.base.SeriesMethods, vendoracessors.StructAccessor @@ -46,7 +29,7 @@ class StructAccessor( __doc__ = vendoracessors.StructAccessor.__doc__ def field(self, name_or_index: str | int) -> bigframes.series.Series: - series = self._apply_unary_op(_StructField(name_or_index)) + series = self._apply_unary_op(bigframes.operations.StructFieldOp(name_or_index)) if isinstance(name_or_index, str): name = name_or_index else: diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 0c2c1f87aa..fd55de02dd 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -293,12 +293,12 @@ def _perform_get_dummies_block_operations( if column_label == "": new_column_label = value new_block, new_id = block.apply_unary_op( - column_id, ops.BinopPartialLeft(ops.eq_op, value) + column_id, ops.ApplyLeft(ops.eq_op, value) ) intermediate_col_ids.append(new_id) block, _ = new_block.apply_unary_op( new_id, - ops.BinopPartialRight(ops.fillna_op, False), + ops.ApplyRight(ops.fillna_op, False), result_label=new_column_label, ) if dummy_na: diff --git a/bigframes/series.py b/bigframes/series.py index c929775a00..538b45f239 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -184,7 +184,7 @@ def rename( # Will throw if value type isn't compatible with index 
type. block, const_id = block.create_constant(v, dtype=idx_dtype) block, cond_id = block.apply_unary_op( - idx_id, ops.BinopPartialRight(ops.ne_op, k) + idx_id, ops.ApplyRight(base_op=ops.ne_op, right_scalar=k) ) block, new_idx_id = block.apply_ternary_op( idx_id, cond_id, const_id, ops.where_op @@ -262,7 +262,7 @@ def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], ) -> Series: - return self._apply_unary_op(bigframes.operations.AsTypeOp(dtype)) + return self._apply_unary_op(bigframes.operations.AsTypeOp(to_type=dtype)) def to_pandas( self, @@ -332,7 +332,7 @@ def drop( level_id = self._resolve_levels(level or 0)[0] if _is_list_like(index): block, inverse_condition_id = block.apply_unary_op( - level_id, ops.IsInOp(index, match_nulls=True) + level_id, ops.IsInOp(values=tuple(index), match_nulls=True) ) block, condition_id = block.apply_unary_op( inverse_condition_id, ops.invert_op @@ -448,7 +448,7 @@ def replace( ) block, result_col = self._block.apply_unary_op( self._value_column, - ops.ReplaceRegexOp(to_replace, value), + ops.RegexReplaceStrOp(pat=to_replace, repl=value), result_label=self.name, ) return Series(block.select_column(result_col)) @@ -458,7 +458,7 @@ def replace( ) elif utils.is_list_like(to_replace): block, cond = self._block.apply_unary_op( - self._value_column, ops.IsInOp(to_replace) + self._value_column, ops.IsInOp(values=tuple(to_replace)) ) block, result_col = block.apply_binary_op( cond, @@ -469,7 +469,8 @@ def replace( return Series(block.select_column(result_col)) else: # Scalar block, cond = self._block.apply_unary_op( - self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace) + self._value_column, + ops.ApplyLeft(base_op=ops.eq_op, left_scalar=to_replace), ) block, result_col = block.apply_binary_op( cond, @@ -527,9 +528,9 @@ def isin(self, values) -> "Series" | None: f"isin(), you passed a [{type(values).__name__}]" ) - return self._apply_unary_op(ops.IsInOp(values, match_nulls=True)).fillna( - value=False - ) + return self._apply_unary_op( + ops.IsInOp(values=tuple(values), match_nulls=True) + ).fillna(value=False) def isna(self) -> "Series": return self._apply_unary_op(ops.isnull_op) @@ -885,9 +886,9 @@ def clip(self, lower, upper): if lower is None and upper is None: return self if lower is None: - return self._apply_binary_op(upper, ops.clip_upper, alignment="left") + return self._apply_binary_op(upper, ops.clipupper_op, alignment="left") if upper is None: - return self._apply_binary_op(lower, ops.clip_lower, alignment="left") + return self._apply_binary_op(lower, ops.cliplower_op, alignment="left") value_id, lower_id, upper_id, block = self._align3(lower, upper) block, result_id = block.apply_ternary_op( value_id, lower_id, upper_id, ops.clip_op @@ -1194,7 +1195,9 @@ def apply(self, func) -> Series: # to be applied before passing data to remote function, protecting from bad # inputs causing errors. 
reprojected_series = Series(self._block._force_reproject()) - return reprojected_series._apply_unary_op(ops.RemoteFunctionOp(func)) + return reprojected_series._apply_unary_op( + ops.RemoteFunctionOp(func=func, apply_on_null=True) + ) def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) @@ -1223,16 +1226,16 @@ def filter( block = self._block block, label_string_id = block.apply_unary_op( self._block.index_columns[0], - ops.AsTypeOp(pandas.StringDtype(storage="pyarrow")), + ops.AsTypeOp(to_type=pandas.StringDtype(storage="pyarrow")), ) if like is not None: block, mask_id = block.apply_unary_op( - label_string_id, ops.ContainsStringOp(pat=like) + label_string_id, ops.StrContainsOp(pat=like) ) else: # regex assert regex is not None block, mask_id = block.apply_unary_op( - label_string_id, ops.ContainsRegexOp(pat=regex) + label_string_id, ops.StrContainsRegexOp(pat=regex) ) block = block.filter(mask_id) @@ -1242,7 +1245,7 @@ def filter( # Behavior matches pandas 2.1+, older pandas versions would reindex block = self._block block, mask_id = block.apply_unary_op( - self._block.index_columns[0], ops.IsInOp(values=list(items)) + self._block.index_columns[0], ops.IsInOp(values=tuple(items)) ) block = block.filter(mask_id) block = block.select_columns([self._value_column]) From 14b50985341b5eaf88b8f540d498c4dda63a39f8 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 8 Jan 2024 21:58:04 +0000 Subject: [PATCH 2/4] fix 3-valued logic --- bigframes/core/compile/scalar_op_compiler.py | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 34230024a6..ff74120c67 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -664,11 +664,29 @@ def ne_op( return x != y +def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue): + return ibis.where( + where_value, + value, + ibis.null(), + ) + + @scalar_op_compiler.register_binary_op(ops.and_op) def and_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For AND, when we encounter a + # NULL value, we only know when the result is FALSE, otherwise the result + # is unknown (NULL). See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(False)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(False)) return typing.cast(ibis_types.BooleanValue, x) & typing.cast( ibis_types.BooleanValue, y ) @@ -679,6 +697,16 @@ def or_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For OR, when we encounter a + # NULL value, we only know when the result is TRUE, otherwise the result + # is unknown (NULL). 
See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(True)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(True)) return typing.cast(ibis_types.BooleanValue, x) | typing.cast( ibis_types.BooleanValue, y ) From d50c9f490538c76617ab4755232c72609ada3248 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 8 Jan 2024 23:18:16 +0000 Subject: [PATCH 3/4] add op registry docstrings --- bigframes/core/compile/scalar_op_compiler.py | 41 ++++++++++++++++++-- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index ff74120c67..d711dbf456 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -61,6 +61,16 @@ def register_unary_op( op_ref: typing.Union[ops.UnaryOp, type[ops.UnaryOp]], pass_op: bool = False, ): + """ + Decorator to register a unary op implementation. + + Args: + op_ref (UnaryOp or UnaryOp type): + Class or instance of operator that is implemented by the decorated function. + pass_op (bool): + Set to true if implementation takes the operator object as the last argument. + This is needed for parameterized ops where parameters are part of op object. + """ key = typing.cast(str, op_ref.name) def decorator(impl: typing.Callable[..., ibis_types.Value]): @@ -80,6 +90,16 @@ def register_binary_op( op_ref: typing.Union[ops.BinaryOp, type[ops.BinaryOp]], pass_op: bool = False, ): + """ + Decorator to register a binary op implementation. + + Args: + op_ref (BinaryOp or BinaryOp type): + Class or instance of operator that is implemented by the decorated function. + pass_op (bool): + Set to true if implementation takes the operator object as the last argument. + This is needed for parameterized ops where parameters are part of op object. + """ key = typing.cast(str, op_ref.name) def decorator(impl: typing.Callable[..., ibis_types.Value]): @@ -97,6 +117,13 @@ def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): def register_ternary_op( self, op_ref: typing.Union[ops.TernaryOp, type[ops.TernaryOp]] ): + """ + Decorator to register a ternary op implementation. + + Args: + op_ref (TernaryOp or TernaryOp type): + Class or instance of operator that is implemented by the decorated function. + """ key = typing.cast(str, op_ref.name) def decorator(impl: typing.Callable[..., ibis_types.Value]): @@ -719,10 +746,16 @@ def add_op( y: ibis_types.Value, ): if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): - return - return typing.cast(ibis_types.NumericValue, x) + typing.cast( - ibis_types.NumericValue, y - ) + return ibis.null() + try: + # Could be string concatenation or numeric addition. + return x + y # type: ignore + except ibis.common.annotations.SignatureValidationError as exc: + left_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(x.type()) + right_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(y.type()) + raise TypeError( + f"Cannot add {repr(left_type)} and {repr(right_type)}. 
{constants.FEEDBACK_LINK}" + ) from exc @scalar_op_compiler.register_binary_op(ops.sub_op) From dae34df6db18d19e2e30aa85c11deeb4c2564ea6 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 9 Jan 2024 00:26:02 +0000 Subject: [PATCH 4/4] Give MapOp a name --- bigframes/operations/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 25a874fd91..3ef551e453 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -267,6 +267,7 @@ class RemoteFunctionOp(UnaryOp): @dataclasses.dataclass(frozen=True) class MapOp(UnaryOp): + name = "map_values" mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...]