diff --git a/CHANGELOG.md b/CHANGELOG.md index a8ebb7a417..bee4ecf095 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,33 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.37.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.36.0...v1.37.0) (2025-02-19) + + +### Features + +* JSON dtype support for read_pandas and Series constructor ([#1391](https://github.com/googleapis/python-bigquery-dataframes/issues/1391)) ([44f4137](https://github.com/googleapis/python-bigquery-dataframes/commit/44f4137adb02790e07c696f0641bc58390857210)) +* Support add, sub, mult, div, and more between timedeltas ([#1396](https://github.com/googleapis/python-bigquery-dataframes/issues/1396)) ([ffa63d4](https://github.com/googleapis/python-bigquery-dataframes/commit/ffa63d47ca1dd1a18617f44d9b3bc33419656a20)) +* Support comparison, ordering, and filtering for timedeltas ([#1387](https://github.com/googleapis/python-bigquery-dataframes/issues/1387)) ([34d01b2](https://github.com/googleapis/python-bigquery-dataframes/commit/34d01b27f867abf10bddffdf4f88fa7052cd237c)) +* Support subtraction in DATETIME/TIMESTAMP columns with timedelta columns ([#1390](https://github.com/googleapis/python-bigquery-dataframes/issues/1390)) ([50ad3a5](https://github.com/googleapis/python-bigquery-dataframes/commit/50ad3a56e9bd77bb77d60d7d5ec497e3335a7177)) + + +### Bug Fixes + +* Ensure binops with pandas objects returns bigquery dataframes ([#1404](https://github.com/googleapis/python-bigquery-dataframes/issues/1404)) ([3cee24b](https://github.com/googleapis/python-bigquery-dataframes/commit/3cee24bae1d352015a5b6a8c18d5c394293d08fd)) + + +### Performance Improvements + +* Prune projections more aggressively ([#1398](https://github.com/googleapis/python-bigquery-dataframes/issues/1398)) ([7990262](https://github.com/googleapis/python-bigquery-dataframes/commit/7990262cf09e97c0739be922ede151d616655726)) +* Simplify sum aggregate SQL text ([#1395](https://github.com/googleapis/python-bigquery-dataframes/issues/1395)) ([0145656](https://github.com/googleapis/python-bigquery-dataframes/commit/0145656e5e378442f2f38f9f04e87e33ddf345f5)) +* Use simple null constraints to simplify queries ([#1381](https://github.com/googleapis/python-bigquery-dataframes/issues/1381)) ([00611d4](https://github.com/googleapis/python-bigquery-dataframes/commit/00611d4d697a8b74451375f5a7700b92a4410295)) + + +### Documentation + +* Add DataFrame.struct docs ([#1348](https://github.com/googleapis/python-bigquery-dataframes/issues/1348)) ([7e9e93a](https://github.com/googleapis/python-bigquery-dataframes/commit/7e9e93aafd26cbfec9a1710caaf97937bcb6ee05)) + ## [1.36.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.35.0...v1.36.0) (2025-02-11) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index a05030140e..14c7a72e78 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -130,6 +130,193 @@ def image_blur_func( image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"]) +def image_blur_to_bytes_func(src_obj_ref_rt: str, ksize_x: int, ksize_y: int) -> bytes: + import json + + import cv2 as cv # type: ignore + import numpy as np + import requests + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + + response = requests.get(src_url) + bts = response.content + + nparr = np.frombuffer(bts, np.uint8) + img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + img_blurred = cv.blur(img, 
ksize=(ksize_x, ksize_y)) + bts = cv.imencode(".jpeg", img_blurred)[1].tobytes() + + return bts + + +image_blur_to_bytes_def = FunctionDef( + image_blur_to_bytes_func, ["opencv-python", "numpy", "requests"] +) + + +def image_resize_func( + src_obj_ref_rt: str, + dst_obj_ref_rt: str, + dsize_x: int, + dsize_y: int, + fx: float, + fy: float, +) -> str: + import json + + import cv2 as cv # type: ignore + import numpy as np + import requests + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + dst_obj_ref_rt_json = json.loads(dst_obj_ref_rt) + + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] + + response = requests.get(src_url) + bts = response.content + + nparr = np.frombuffer(bts, np.uint8) + img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + img_resized = cv.resize(img, dsize=(dsize_x, dsize_y), fx=fx, fy=fy) + bts = cv.imencode(".jpeg", img_resized)[1].tobytes() + + requests.put( + url=dst_url, + data=bts, + headers={ + "Content-Type": "image/jpeg", + }, + ) + + return dst_obj_ref_rt + + +image_resize_def = FunctionDef( + image_resize_func, ["opencv-python", "numpy", "requests"] +) + + +def image_resize_to_bytes_func( + src_obj_ref_rt: str, + dsize_x: int, + dsize_y: int, + fx: float, + fy: float, +) -> bytes: + import json + + import cv2 as cv # type: ignore + import numpy as np + import requests + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + + response = requests.get(src_url) + bts = response.content + + nparr = np.frombuffer(bts, np.uint8) + img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + img_resized = cv.resize(img, dsize=(dsize_x, dsize_y), fx=fx, fy=fy) + bts = cv.imencode(".jpeg", img_resized)[1].tobytes() + + return bts + + +image_resize_to_bytes_def = FunctionDef( + image_resize_to_bytes_func, ["opencv-python", "numpy", "requests"] +) + + +def image_normalize_func( + src_obj_ref_rt: str, dst_obj_ref_rt: str, alpha: float, beta: float, norm_type: str +) -> str: + import json + + import cv2 as cv # type: ignore + import numpy as np + import requests + + norm_type_mapping = { + "inf": cv.NORM_INF, + "l1": cv.NORM_L1, + "l2": cv.NORM_L2, + "minmax": cv.NORM_MINMAX, + } + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + dst_obj_ref_rt_json = json.loads(dst_obj_ref_rt) + + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] + + response = requests.get(src_url) + bts = response.content + + nparr = np.frombuffer(bts, np.uint8) + img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + img_normalized = cv.normalize( + img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type] + ) + bts = cv.imencode(".jpeg", img_normalized)[1].tobytes() + + requests.put( + url=dst_url, + data=bts, + headers={ + "Content-Type": "image/jpeg", + }, + ) + + return dst_obj_ref_rt + + +image_normalize_def = FunctionDef( + image_normalize_func, ["opencv-python", "numpy", "requests"] +) + + +def image_normalize_to_bytes_func( + src_obj_ref_rt: str, alpha: float, beta: float, norm_type: str +) -> bytes: + import json + + import cv2 as cv # type: ignore + import numpy as np + import requests + + norm_type_mapping = { + "inf": cv.NORM_INF, + "l1": cv.NORM_L1, + "l2": cv.NORM_L2, + "minmax": cv.NORM_MINMAX, + } + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + + response = requests.get(src_url) + bts = response.content + + nparr = 
np.frombuffer(bts, np.uint8) + img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + img_normalized = cv.normalize( + img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type] + ) + bts = cv.imencode(".jpeg", img_normalized)[1].tobytes() + + return bts + + +image_normalize_to_bytes_def = FunctionDef( + image_normalize_to_bytes_func, ["opencv-python", "numpy", "requests"] +) + + # Extracts all text from a PDF url def pdf_extract_func(src_obj_ref_rt: str) -> str: import io diff --git a/bigframes/core/bigframe_node.py b/bigframes/core/bigframe_node.py new file mode 100644 index 0000000000..32c7f92912 --- /dev/null +++ b/bigframes/core/bigframe_node.py @@ -0,0 +1,384 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import abc +import collections +import dataclasses +import functools +import itertools +import typing +from typing import Callable, Dict, Generator, Iterable, Mapping, Set, Tuple + +from bigframes.core import identifiers +import bigframes.core.guid +import bigframes.core.schema as schemata +import bigframes.dtypes + +if typing.TYPE_CHECKING: + import bigframes.session + +COLUMN_SET = frozenset[identifiers.ColumnId] + + +@dataclasses.dataclass(frozen=True) +class Field: + id: identifiers.ColumnId + dtype: bigframes.dtypes.Dtype + # Best effort, nullable=True if not certain + nullable: bool = True + + def with_nullable(self) -> Field: + return Field(self.id, self.dtype, nullable=True) + + def with_nonnull(self) -> Field: + return Field(self.id, self.dtype, nullable=False) + + def with_id(self, id: identifiers.ColumnId) -> Field: + return Field(id, self.dtype, nullable=self.nullable) + + +@dataclasses.dataclass(eq=False, frozen=True) +class BigFrameNode: + """ + Immutable node for representing 2D typed array as a tree of operators. + + All subclasses must be hashable so as to be usable as caching key. + """ + + @property + def deterministic(self) -> bool: + """Whether this node will evaluates deterministically.""" + return True + + @property + def row_preserving(self) -> bool: + """Whether this node preserves input rows.""" + return True + + @property + def non_local(self) -> bool: + """ + Whether this node combines information across multiple rows instead of processing rows independently. + Used as an approximation for whether the expression may require shuffling to execute (and therefore be expensive). + """ + return False + + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + """Direct children of this node""" + return tuple([]) + + @property + @abc.abstractmethod + def row_count(self) -> typing.Optional[int]: + return None + + @abc.abstractmethod + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> BigFrameNode: + """Remap variable references""" + ... + + @property + @abc.abstractmethod + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: + """The variables defined in this node (as opposed to by child nodes).""" + ... 
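# [Editorial aside, not part of the patch] A minimal sketch of the Field helpers
# defined near the top of the new bigframes/core/bigframe_node.py. Field is a
# frozen dataclass, so with_nonnull()/with_nullable()/with_id() return fresh
# copies rather than mutating in place; this sketch assumes identifiers.ColumnId
# simply wraps a column name, and the ids used are illustrative.
f = Field(identifiers.ColumnId("col_0"), bigframes.dtypes.INT_DTYPE)
g = f.with_nonnull()
assert f.nullable is True                  # default stays "best effort" nullable
assert g.nullable is False                 # copy carries the non-null constraint
assert g.with_id(identifiers.ColumnId("col_1")).id.name == "col_1"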
+ + @functools.cached_property + def session(self): + sessions = [] + for child in self.child_nodes: + if child.session is not None: + sessions.append(child.session) + unique_sessions = len(set(sessions)) + if unique_sessions > 1: + raise ValueError("Cannot use combine sources from multiple sessions.") + elif unique_sessions == 1: + return sessions[0] + return None + + def _validate(self): + """Validate the local data in the node.""" + return + + @functools.cache + def validate_tree(self) -> bool: + for child in self.child_nodes: + child.validate_tree() + self._validate() + field_list = list(self.fields) + if len(set(field_list)) != len(field_list): + raise ValueError(f"Non unique field ids {list(self.fields)}") + return True + + def _as_tuple(self) -> Tuple: + """Get all fields as tuple.""" + return tuple(getattr(self, field.name) for field in dataclasses.fields(self)) + + def __hash__(self) -> int: + # Custom hash that uses cache to avoid costly recomputation + return self._cached_hash + + def __eq__(self, other) -> bool: + # Custom eq that tries to short-circuit full structural comparison + if not isinstance(other, self.__class__): + return False + if self is other: + return True + if hash(self) != hash(other): + return False + return self._as_tuple() == other._as_tuple() + + # BigFrameNode trees can be very deep so its important avoid recalculating the hash from scratch + # Each subclass of BigFrameNode should use this property to implement __hash__ + # The default dataclass-generated __hash__ method is not cached + @functools.cached_property + def _cached_hash(self): + return hash(self._as_tuple()) + + @property + def roots(self) -> typing.Set[BigFrameNode]: + roots = itertools.chain.from_iterable( + map(lambda child: child.roots, self.child_nodes) + ) + return set(roots) + + # TODO: Store some local data lazily for select, aggregate nodes. + @property + @abc.abstractmethod + def fields(self) -> Iterable[Field]: + ... + + @property + def ids(self) -> Iterable[identifiers.ColumnId]: + """All output ids from the node.""" + return (field.id for field in self.fields) + + @property + @abc.abstractmethod + def variables_introduced(self) -> int: + """ + Defines number of values created by the current node. Helps represent the "width" of a query + """ + ... + + @property + def relation_ops_created(self) -> int: + """ + Defines the number of relational ops generated by the current node. Used to estimate query planning complexity. + """ + return 1 + + @property + def joins(self) -> bool: + """ + Defines whether the node joins data. + """ + return False + + @property + @abc.abstractmethod + def order_ambiguous(self) -> bool: + """ + Whether row ordering is potentially ambiguous. For example, ReadTable (without a primary key) could be ordered in different ways. + """ + ... + + @property + @abc.abstractmethod + def explicitly_ordered(self) -> bool: + """ + Whether row ordering is potentially ambiguous. For example, ReadTable (without a primary key) could be ordered in different ways. + """ + ... 
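# [Editorial aside, not part of the patch] The __hash__/__eq__ pattern used by
# BigFrameNode above, restated as a tiny standalone class: hash the dataclass
# field tuple once, cache it on the instance, and use the cached hash to
# short-circuit deep structural comparisons on these potentially very deep
# trees. The class name is an illustrative stand-in, not a bigframes type.
import dataclasses as _dc
import functools as _ft

@_dc.dataclass(eq=False, frozen=True)
class _CachedHashNode:
    payload: tuple

    def _as_tuple(self):
        return tuple(getattr(self, f.name) for f in _dc.fields(self))

    @_ft.cached_property
    def _cached_hash(self):
        return hash(self._as_tuple())      # computed once per instance

    def __hash__(self):
        return self._cached_hash

    def __eq__(self, other):
        if self is other:
            return True
        if not isinstance(other, self.__class__) or hash(self) != hash(other):
            return False                   # cheap rejection before deep compare
        return self._as_tuple() == other._as_tuple()

assert _CachedHashNode((1, 2)) == _CachedHashNode((1, 2))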
+ + @functools.cached_property + def height(self) -> int: + if len(self.child_nodes) == 0: + return 0 + return max(child.height for child in self.child_nodes) + 1 + + @functools.cached_property + def total_variables(self) -> int: + return self.variables_introduced + sum( + map(lambda x: x.total_variables, self.child_nodes) + ) + + @functools.cached_property + def total_relational_ops(self) -> int: + return self.relation_ops_created + sum( + map(lambda x: x.total_relational_ops, self.child_nodes) + ) + + @functools.cached_property + def total_joins(self) -> int: + return int(self.joins) + sum(map(lambda x: x.total_joins, self.child_nodes)) + + @functools.cached_property + def schema(self) -> schemata.ArraySchema: + # TODO: Make schema just a view on fields + return schemata.ArraySchema( + tuple(schemata.SchemaItem(i.id.name, i.dtype) for i in self.fields) + ) + + @property + def planning_complexity(self) -> int: + """ + Empirical heuristic measure of planning complexity. + + Used to determine when to decompose overly complex computations. May require tuning. + """ + return self.total_variables * self.total_relational_ops * (1 + self.total_joins) + + @abc.abstractmethod + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + """Apply a function to each child node.""" + ... + + @abc.abstractmethod + def remap_vars( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> BigFrameNode: + """Remap defined (in this node only) variables.""" + ... + + @property + def defines_namespace(self) -> bool: + """ + If true, this node establishes a new column id namespace. + + If false, this node consumes and produces ids in the namespace + """ + return False + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + + @functools.cached_property + def defined_variables(self) -> set[str]: + """Full set of variables defined in the namespace, even if not selected.""" + self_defined_variables = set(self.schema.names) + if self.defines_namespace: + return self_defined_variables + return self_defined_variables.union( + *(child.defined_variables for child in self.child_nodes) + ) + + def get_type(self, id: identifiers.ColumnId) -> bigframes.dtypes.Dtype: + return self._dtype_lookup[id] + + # TODO: Deprecate in favor of field_by_id, and eventually, by rich references + @functools.cached_property + def _dtype_lookup(self) -> dict[identifiers.ColumnId, bigframes.dtypes.Dtype]: + return {field.id: field.dtype for field in self.fields} + + @functools.cached_property + def field_by_id(self) -> Mapping[identifiers.ColumnId, Field]: + return {field.id: field for field in self.fields} + + # Plan algorithms + def unique_nodes( + self: BigFrameNode, + ) -> Generator[BigFrameNode, None, None]: + """Walks the tree for unique nodes""" + seen = set() + stack: list[BigFrameNode] = [self] + while stack: + item = stack.pop() + if item not in seen: + yield item + seen.add(item) + stack.extend(item.child_nodes) + + def edges( + self: BigFrameNode, + ) -> Generator[Tuple[BigFrameNode, BigFrameNode], None, None]: + for item in self.unique_nodes(): + for child in item.child_nodes: + yield (item, child) + + def iter_nodes_topo(self: BigFrameNode) -> Generator[BigFrameNode, None, None]: + """Returns nodes from bottom up.""" + queue = collections.deque( + [node for node in self.unique_nodes() if not node.child_nodes] + ) + + child_to_parents: Dict[ + BigFrameNode, Set[BigFrameNode] + ] = collections.defaultdict(set) + for parent, child in self.edges(): + 
child_to_parents[child].add(parent) + + yielded = set() + + while queue: + item = queue.popleft() + yield item + yielded.add(item) + for parent in child_to_parents[item]: + if set(parent.child_nodes).issubset(yielded): + queue.append(parent) + + def top_down( + self: BigFrameNode, + transform: Callable[[BigFrameNode], BigFrameNode], + ) -> BigFrameNode: + """ + Perform a top-down transformation of the BigFrameNode tree. + """ + to_process = [self] + results: Dict[BigFrameNode, BigFrameNode] = {} + + while to_process: + item = to_process.pop() + if item not in results.keys(): + item_result = transform(item) + results[item] = item_result + to_process.extend(item_result.child_nodes) + + to_process = [self] + # for each processed item, replace its children + for item in reversed(list(results.keys())): + results[item] = results[item].transform_children(lambda x: results[x]) + + return results[self] + + def bottom_up( + self: BigFrameNode, + transform: Callable[[BigFrameNode], BigFrameNode], + ) -> BigFrameNode: + """ + Perform a bottom-up transformation of the BigFrameNode tree. + + The `transform` function is applied to each node *after* its children + have been transformed. This allows for transformations that depend + on the results of transforming subtrees. + + Returns the transformed root node. + """ + results: dict[BigFrameNode, BigFrameNode] = {} + for node in list(self.iter_nodes_topo()): + # child nodes have already been transformed + result = node.transform_children(lambda x: results[x]) + result = transform(result) + results[node] = result + + return results[self] diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 8d3732f3fe..10970b24e8 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2049,7 +2049,6 @@ def concat( def isin(self, other: Block): # TODO: Support multiple other columns and match on label - # TODO: Model as explicit "IN" subquery/join to better allow db to optimize assert len(other.value_columns) == 1 unique_other_values = other.expr.select_columns( [other.value_columns[0]] diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 02c7ae128b..91a96febe0 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -164,9 +164,7 @@ def _( ) -> ibis_types.NumericValue: # Will be null if all inputs are null. Pandas defaults to zero sum though. 
bq_sum = _apply_window_if_present(column.sum(), window) - return ( - ibis_api.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() # type: ignore - ) + return bq_sum.fillna(ibis_types.literal(0)) @compile_unary_agg.register diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 93be998b5b..b0cf30269e 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -16,7 +16,7 @@ import functools import itertools import typing -from typing import Optional, Sequence +from typing import Literal, Optional, Sequence import bigframes_vendored.ibis import bigframes_vendored.ibis.backends.bigquery.backend as ibis_bigquery @@ -94,7 +94,7 @@ def to_sql( return typing.cast(str, sql) @property - def columns(self) -> typing.Tuple[ibis_types.Value, ...]: + def columns(self) -> tuple[ibis_types.Value, ...]: return self._columns @property @@ -107,7 +107,7 @@ def _ibis_bindings(self) -> dict[str, ibis_types.Value]: def projection( self, - expression_id_pairs: typing.Tuple[typing.Tuple[ex.Expression, str], ...], + expression_id_pairs: tuple[tuple[ex.Expression, str], ...], ) -> UnorderedIR: """Apply an expression to the ArrayValue and assign the output to a column.""" cannot_inline = any(expr.expensive for expr, _ in expression_id_pairs) @@ -126,7 +126,7 @@ def projection( def selection( self, - input_output_pairs: typing.Tuple[typing.Tuple[ex.DerefOp, str], ...], + input_output_pairs: tuple[tuple[ex.DerefOp, str], ...], ) -> UnorderedIR: """Apply an expression to the ArrayValue and assign the output to a column.""" bindings = {col: self._get_ibis_column(col) for col in self.column_ids} @@ -203,9 +203,8 @@ def filter(self, predicate: ex.Expression) -> UnorderedIR: def aggregate( self, - aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], + aggregations: typing.Sequence[tuple[ex.Aggregation, str]], by_column_ids: typing.Sequence[ex.DerefOp] = (), - dropna: bool = True, order_by: typing.Sequence[OrderingExpression] = (), ) -> UnorderedIR: """ @@ -230,10 +229,6 @@ def aggregate( for aggregate, col_out in aggregations } if by_column_ids: - if dropna: - table = table.filter( - [table[ref.id.sql].notnull() for ref in by_column_ids] - ) result = table.group_by((ref.id.sql for ref in by_column_ids)).aggregate( **stats ) @@ -323,7 +318,105 @@ def from_pandas( columns=columns, ) - ## Methods that only work with ordering + def join( + self: UnorderedIR, + right: UnorderedIR, + conditions: tuple[tuple[str, str], ...], + type: Literal["inner", "outer", "left", "right", "cross"], + *, + join_nulls: bool = True, + ) -> UnorderedIR: + """Join two expressions by column equality. + + Arguments: + left: Expression for left table to join. + left_column_ids: Column IDs (not label) to join by. + right: Expression for right table to join. + right_column_ids: Column IDs (not label) to join by. + how: The type of join to perform. + join_nulls (bool): + If True, will joins NULL keys to each other. + Returns: + The joined expression. The resulting columns will be, in order, + first the coalesced join keys, then, all the left columns, and + finally, all the right columns. 
+ """ + # Shouldn't need to select the column ids explicitly, but it seems that ibis has some + # bug resolving column ids otherwise, potentially because of the "JoinChain" op + left_table = self._to_ibis_expr().select(self.column_ids) + right_table = right._to_ibis_expr().select(right.column_ids) + + join_conditions = [ + _join_condition( + left_table[left_index], right_table[right_index], nullsafe=join_nulls + ) + for left_index, right_index in conditions + ] + + combined_table = bigframes_vendored.ibis.join( + left_table, + right_table, + predicates=join_conditions, + how=type, # type: ignore + ) + columns = [combined_table[col.get_name()] for col in self.columns] + [ + combined_table[col.get_name()] for col in right.columns + ] + return UnorderedIR( + combined_table, + columns=columns, + ) + + def isin_join( + self: UnorderedIR, + right: UnorderedIR, + indicator_col: str, + conditions: tuple[str, str], + *, + join_nulls: bool = True, + ) -> UnorderedIR: + """Join two expressions by column equality. + + Arguments: + left: Expression for left table to join. + right: Expression for right table to join. + conditions: Id pairs to compare + Returns: + The joined expression. + """ + left_table = self._to_ibis_expr() + right_table = right._to_ibis_expr() + if join_nulls: # nullsafe isin join must actually use "exists" subquery + new_column = ( + ( + _join_condition( + left_table[conditions[0]], + right_table[conditions[1]], + nullsafe=True, + ) + ) + .any() + .name(indicator_col) + ) + + else: # Can do simpler "in" subquery + new_column = ( + (left_table[conditions[0]]) + .isin((right_table[conditions[1]])) + .name(indicator_col) + ) + + columns = tuple( + itertools.chain( + (left_table[col.get_name()] for col in self.columns), (new_column,) + ) + ) + + return UnorderedIR( + left_table, + columns=columns, + ) + def project_window_op( self, expression: ex.Aggregation, @@ -429,7 +522,7 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec): group_by: typing.List[ibis_types.Value] = ( [ typing.cast( - ibis_types.Column, _as_identity(self._compile_expression(column)) + ibis_types.Column, _as_groupable(self._compile_expression(column)) ) for column in window_spec.grouping_keys ] @@ -514,7 +607,68 @@ def _convert_ordering_to_table_values( return ordering_values -def _as_identity(value: ibis_types.Value): +def _string_cast_join_cond( + lvalue: ibis_types.Column, rvalue: ibis_types.Column +) -> ibis_types.BooleanColumn: + result = ( + lvalue.cast(ibis_dtypes.str).fill_null(ibis_types.literal("0")) + == rvalue.cast(ibis_dtypes.str).fill_null(ibis_types.literal("0")) + ) & ( + lvalue.cast(ibis_dtypes.str).fill_null(ibis_types.literal("1")) + == rvalue.cast(ibis_dtypes.str).fill_null(ibis_types.literal("1")) + ) + return typing.cast(ibis_types.BooleanColumn, result) + + +def _numeric_join_cond( + lvalue: ibis_types.Column, rvalue: ibis_types.Column +) -> ibis_types.BooleanColumn: + lvalue1 = lvalue.fill_null(ibis_types.literal(0)) + lvalue2 = lvalue.fill_null(ibis_types.literal(1)) + rvalue1 = rvalue.fill_null(ibis_types.literal(0)) + rvalue2 = rvalue.fill_null(ibis_types.literal(1)) + if lvalue.type().is_floating() and rvalue.type().is_floating(): + # NaN aren't equal so need to coalesce as well with diff constants + lvalue1 = ( + typing.cast(ibis_types.FloatingColumn, lvalue) + .isnan() + .ifelse(ibis_types.literal(2), lvalue1) + ) + lvalue2 = ( + typing.cast(ibis_types.FloatingColumn, lvalue) + .isnan() + .ifelse(ibis_types.literal(3), lvalue2) + ) + rvalue1 = ( + 
typing.cast(ibis_types.FloatingColumn, rvalue) + .isnan() + .ifelse(ibis_types.literal(2), rvalue1) + ) + rvalue2 = ( + typing.cast(ibis_types.FloatingColumn, rvalue) + .isnan() + .ifelse(ibis_types.literal(3), rvalue2) + ) + result = (lvalue1 == rvalue1) & (lvalue2 == rvalue2) + return typing.cast(ibis_types.BooleanColumn, result) + + +def _join_condition( + lvalue: ibis_types.Column, rvalue: ibis_types.Column, nullsafe: bool +) -> ibis_types.BooleanColumn: + if (lvalue.type().is_floating()) and (lvalue.type().is_floating()): + # Need to always make safe join condition to handle nan, even if no nulls + return _numeric_join_cond(lvalue, rvalue) + if nullsafe: + # TODO: Define more coalesce constants for non-numeric types to avoid cast + if (lvalue.type().is_numeric()) and (lvalue.type().is_numeric()): + return _numeric_join_cond(lvalue, rvalue) + else: + return _string_cast_join_cond(lvalue, rvalue) + return typing.cast(ibis_types.BooleanColumn, lvalue == rvalue) + + +def _as_groupable(value: ibis_types.Value): # Some types need to be converted to string to enable groupby if value.type().is_float64() or value.type().is_geospatial(): return value.cast(ibis_dtypes.str) diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 64a0ae265f..9b271bf67b 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -20,20 +20,19 @@ import bigframes_vendored.ibis.backends.bigquery as ibis_bigquery import bigframes_vendored.ibis.expr.api as ibis_api +import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.types as ibis_types import google.cloud.bigquery import pandas as pd +from bigframes import dtypes, operations from bigframes.core import utils import bigframes.core.compile.compiled as compiled import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.explode import bigframes.core.compile.ibis_types -import bigframes.core.compile.isin -import bigframes.core.compile.scalar_op_compiler import bigframes.core.compile.scalar_op_compiler as compile_scalar import bigframes.core.compile.schema_translator -import bigframes.core.compile.single_column import bigframes.core.expression as ex import bigframes.core.identifiers as ids import bigframes.core.nodes as nodes @@ -130,24 +129,25 @@ def compile_join(self, node: nodes.JoinNode): condition_pairs = tuple( (left.id.sql, right.id.sql) for left, right in node.conditions ) + left_unordered = self.compile_node(node.left_child) right_unordered = self.compile_node(node.right_child) - return bigframes.core.compile.single_column.join_by_column_unordered( - left=left_unordered, + return left_unordered.join( right=right_unordered, type=node.type, conditions=condition_pairs, + join_nulls=node.joins_nulls, ) @_compile_node.register def compile_isin(self, node: nodes.InNode): left_unordered = self.compile_node(node.left_child) right_unordered = self.compile_node(node.right_child) - return bigframes.core.compile.isin.isin_unordered( - left=left_unordered, + return left_unordered.isin_join( right=right_unordered, indicator_col=node.indicator_col.sql, conditions=(node.left_col.id.sql, node.right_col.id.sql), + join_nulls=node.joins_nulls, ) @_compile_node.register @@ -225,6 +225,18 @@ def compile_read_table_unordered( ibis_table = self.read_table_as_unordered_ibis( source, scan_cols=[col.source_id for col in scan.items] ) + + # TODO(b/395912450): Remove workaround solution once b/374784249 got resolved. 
+ for scan_item in scan.items: + if ( + scan_item.dtype == dtypes.JSON_DTYPE + and ibis_table[scan_item.source_id].type() == ibis_dtypes.string + ): + json_column = compile_scalar.parse_json( + ibis_table[scan_item.source_id] + ).name(scan_item.source_id) + ibis_table = ibis_table.mutate(json_column) + return compiled.UnorderedIR( ibis_table, tuple( @@ -266,8 +278,13 @@ def compile_rowcount(self, node: nodes.RowCountNode): def compile_aggregate(self, node: nodes.AggregateNode): aggs = tuple((agg, id.sql) for agg, id in node.aggregations) result = self.compile_node(node.child).aggregate( - aggs, node.by_column_ids, node.dropna, order_by=node.order_by + aggs, node.by_column_ids, order_by=node.order_by ) + # TODO: Remove dropna field and use filter node instead + if node.dropna: + for key in node.by_column_ids: + if node.child.field_by_id[key.id].nullable: + result = result.filter(operations.notnull_op.as_expr(key)) return result @_compile_node.register diff --git a/bigframes/core/compile/isin.py b/bigframes/core/compile/isin.py deleted file mode 100644 index 29acf9e284..0000000000 --- a/bigframes/core/compile/isin.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Helpers to join ArrayValue objects.""" - -from __future__ import annotations - -import itertools -from typing import Tuple - -import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -import bigframes_vendored.ibis.expr.types as ibis_types - -import bigframes.core.compile.compiled as compiled - - -def isin_unordered( - left: compiled.UnorderedIR, - right: compiled.UnorderedIR, - indicator_col: str, - conditions: Tuple[str, str], -) -> compiled.UnorderedIR: - """Join two expressions by column equality. - - Arguments: - left: Expression for left table to join. - right: Expression for right table to join. - conditions: Id pairs to compare - Returns: - The joined expression. 
- """ - left_table = left._to_ibis_expr() - right_table = right._to_ibis_expr() - new_column = ( - value_to_join_key(left_table[conditions[0]]) - .isin(value_to_join_key(right_table[conditions[1]])) - .name(indicator_col) - ) - - columns = tuple( - itertools.chain( - (left_table[col.get_name()] for col in left.columns), (new_column,) - ) - ) - - return compiled.UnorderedIR( - left_table, - columns=columns, - ) - - -def value_to_join_key(value: ibis_types.Value): - """Converts nullable values to non-null string SQL will not match null keys together - but pandas does.""" - if not value.type().is_string(): - value = value.cast(ibis_dtypes.str) - return ( - value.fill_null(ibis_types.literal("$NULL_SENTINEL$")) - if hasattr(value, "fill_null") - else value.fillna(ibis_types.literal("$NULL_SENTINEL$")) - ) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 3e5f10eca4..d5ce6e9e09 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -747,6 +747,11 @@ def timestamp_add_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerVal return x + y.to_interval("us") +@scalar_op_compiler.register_binary_op(ops.timestamp_sub_op) +def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): + return x - y.to_interval("us") + + @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp): supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"] diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py deleted file mode 100644 index 9216051d91..0000000000 --- a/bigframes/core/compile/single_column.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Helpers to join ArrayValue objects.""" - -from __future__ import annotations - -from typing import Literal, Tuple - -import bigframes_vendored.ibis.expr.api as ibis_api -import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -import bigframes_vendored.ibis.expr.types as ibis_types - -import bigframes.core.compile.compiled as compiled - - -def join_by_column_unordered( - left: compiled.UnorderedIR, - right: compiled.UnorderedIR, - conditions: Tuple[Tuple[str, str], ...], - type: Literal["inner", "outer", "left", "right", "cross"], -) -> compiled.UnorderedIR: - """Join two expressions by column equality. - - Arguments: - left: Expression for left table to join. - left_column_ids: Column IDs (not label) to join by. - right: Expression for right table to join. - right_column_ids: Column IDs (not label) to join by. - how: The type of join to perform. - allow_row_identity_join (bool): - If True, allow matching by row identity. Set to False to always - perform a true JOIN in generated SQL. - Returns: - The joined expression. 
The resulting columns will be, in order, - first the coalesced join keys, then, all the left columns, and - finally, all the right columns. - """ - # Shouldn't need to select the column ids explicitly, but it seems that ibis has some - # bug resolving column ids otherwise, potentially because of the "JoinChain" op - left_table = left._to_ibis_expr().select(left.column_ids) - right_table = right._to_ibis_expr().select(right.column_ids) - join_conditions = [ - value_to_join_key(left_table[left_index]) - == value_to_join_key(right_table[right_index]) - for left_index, right_index in conditions - ] - - combined_table = ibis_api.join( - left_table, - right_table, - predicates=join_conditions, - how=type, # type: ignore - ) - columns = [combined_table[col.get_name()] for col in left.columns] + [ - combined_table[col.get_name()] for col in right.columns - ] - return compiled.UnorderedIR( - combined_table, - columns=columns, - ) - - -def value_to_join_key(value: ibis_types.Value): - """Converts nullable values to non-null string SQL will not match null keys together - but pandas does.""" - if not value.type().is_string(): - value = value.cast(ibis_dtypes.str) - return ( - value.fill_null(ibis_types.literal("$NULL_SENTINEL$")) - if hasattr(value, "fill_null") - else value.fillna(ibis_types.literal("$NULL_SENTINEL$")) - ) diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 8621d5d915..afd290827d 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -165,6 +165,10 @@ def expensive(self) -> bool: isinstance(ex, OpExpression) and ex.op.expensive for ex in self.walk() ) + @property + def nullable(self) -> bool: + return True + @property @abc.abstractmethod def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: @@ -248,6 +252,10 @@ def is_const(self) -> bool: def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: return () + @property + def nullable(self) -> bool: + return pd.isna(self.value) # type: ignore + def output_type( self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] ) -> dtypes.ExpressionType: @@ -344,6 +352,11 @@ def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: def is_const(self) -> bool: return False + @property + def nullable(self) -> bool: + # Safe default, need to actually bind input schema to determine + return True + def output_type( self, input_types: dict[ids.ColumnId, bigframes.dtypes.Dtype] ) -> dtypes.ExpressionType: @@ -408,6 +421,14 @@ def is_const(self) -> bool: def children(self): return self.inputs + @property + def nullable(self) -> bool: + # This is very conservative, need to label null properties of individual ops to get more precise + null_free = self.is_identity and not any( + child.nullable for child in self.inputs + ) + return not null_free + def output_type( self, input_types: dict[ids.ColumnId, dtypes.ExpressionType] ) -> dtypes.ExpressionType: diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 6ad0973262..b3a07d33bc 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -48,6 +48,8 @@ class Index(vendored_pandas_index.Index): _linked_frame: Union[ bigframes.dataframe.DataFrame, bigframes.series.Series, None ] = None + # Must be above 5000 for pandas to delegate to bigframes for binops + __pandas_priority__ = 12000 # Overrided on __new__ to create subclasses like pandas does def __new__( diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index d5273e5c0a..e2093e57d9 100644 --- a/bigframes/core/nodes.py 
+++ b/bigframes/core/nodes.py @@ -20,11 +20,12 @@ import functools import itertools import typing -from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple, TypeVar +from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple import google.cloud.bigquery as bq from bigframes.core import identifiers +from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET, Field import bigframes.core.expression as ex import bigframes.core.guid from bigframes.core.ordering import OrderingExpression @@ -41,246 +42,6 @@ # A fixed number of variable to assume for overhead on some operations OVERHEAD_VARIABLES = 5 -COLUMN_SET = frozenset[identifiers.ColumnId] - -Self = TypeVar("Self") - - -@dataclasses.dataclass(frozen=True) -class Field: - id: identifiers.ColumnId - dtype: bigframes.dtypes.Dtype - - -@dataclasses.dataclass(eq=False, frozen=True) -class BigFrameNode(abc.ABC): - """ - Immutable node for representing 2D typed array as a tree of operators. - - All subclasses must be hashable so as to be usable as caching key. - """ - - @property - def deterministic(self) -> bool: - """Whether this node will evaluates deterministically.""" - return True - - @property - def row_preserving(self) -> bool: - """Whether this node preserves input rows.""" - return True - - @property - def non_local(self) -> bool: - """ - Whether this node combines information across multiple rows instead of processing rows independently. - Used as an approximation for whether the expression may require shuffling to execute (and therefore be expensive). - """ - return False - - @property - def child_nodes(self) -> typing.Sequence[BigFrameNode]: - """Direct children of this node""" - return tuple([]) - - @property - @abc.abstractmethod - def row_count(self) -> typing.Optional[int]: - return None - - @abc.abstractmethod - def remap_vars( - self: Self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] - ) -> Self: - """Remap defined (in this node only) variables.""" - ... - - @abc.abstractmethod - def remap_refs( - self: Self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] - ) -> Self: - """Remap variable references""" - ... - - @property - @abc.abstractmethod - def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: - """The variables defined in this node (as opposed to by child nodes).""" - ... 
- - @property - def referenced_ids(self) -> COLUMN_SET: - return frozenset() - - @functools.cached_property - def session(self): - sessions = [] - for child in self.child_nodes: - if child.session is not None: - sessions.append(child.session) - unique_sessions = len(set(sessions)) - if unique_sessions > 1: - raise ValueError("Cannot use combine sources from multiple sessions.") - elif unique_sessions == 1: - return sessions[0] - return None - - def _validate(self): - """Validate the local data in the node.""" - return - - @functools.cache - def validate_tree(self) -> bool: - for child in self.child_nodes: - child.validate_tree() - self._validate() - field_list = list(self.fields) - if len(set(field_list)) != len(field_list): - raise ValueError(f"Non unique field ids {list(self.fields)}") - return True - - def _as_tuple(self) -> Tuple: - """Get all fields as tuple.""" - return tuple(getattr(self, field.name) for field in dataclasses.fields(self)) - - def __hash__(self) -> int: - # Custom hash that uses cache to avoid costly recomputation - return self._cached_hash - - def __eq__(self, other) -> bool: - # Custom eq that tries to short-circuit full structural comparison - if not isinstance(other, self.__class__): - return False - if self is other: - return True - if hash(self) != hash(other): - return False - return self._as_tuple() == other._as_tuple() - - # BigFrameNode trees can be very deep so its important avoid recalculating the hash from scratch - # Each subclass of BigFrameNode should use this property to implement __hash__ - # The default dataclass-generated __hash__ method is not cached - @functools.cached_property - def _cached_hash(self): - return hash(self._as_tuple()) - - @property - def roots(self) -> typing.Set[BigFrameNode]: - roots = itertools.chain.from_iterable( - map(lambda child: child.roots, self.child_nodes) - ) - return set(roots) - - # TODO: Store some local data lazily for select, aggregate nodes. - @property - @abc.abstractmethod - def fields(self) -> Iterable[Field]: - ... - - @property - def ids(self) -> Iterable[identifiers.ColumnId]: - """All output ids from the node.""" - return (field.id for field in self.fields) - - @property - @abc.abstractmethod - def variables_introduced(self) -> int: - """ - Defines number of values created by the current node. Helps represent the "width" of a query - """ - ... - - @property - def relation_ops_created(self) -> int: - """ - Defines the number of relational ops generated by the current node. Used to estimate query planning complexity. - """ - return 1 - - @property - def joins(self) -> bool: - """ - Defines whether the node joins data. - """ - return False - - @property - @abc.abstractmethod - def order_ambiguous(self) -> bool: - """ - Whether row ordering is potentially ambiguous. For example, ReadTable (without a primary key) could be ordered in different ways. - """ - ... - - @property - @abc.abstractmethod - def explicitly_ordered(self) -> bool: - """ - Whether row ordering is potentially ambiguous. For example, ReadTable (without a primary key) could be ordered in different ways. - """ - ... 
- - @functools.cached_property - def height(self) -> int: - if len(self.child_nodes) == 0: - return 0 - return max(child.height for child in self.child_nodes) + 1 - - @functools.cached_property - def total_variables(self) -> int: - return self.variables_introduced + sum( - map(lambda x: x.total_variables, self.child_nodes) - ) - - @functools.cached_property - def total_relational_ops(self) -> int: - return self.relation_ops_created + sum( - map(lambda x: x.total_relational_ops, self.child_nodes) - ) - - @functools.cached_property - def total_joins(self) -> int: - return int(self.joins) + sum(map(lambda x: x.total_joins, self.child_nodes)) - - @functools.cached_property - def schema(self) -> schemata.ArraySchema: - # TODO: Make schema just a view on fields - return schemata.ArraySchema( - tuple(schemata.SchemaItem(i.id.name, i.dtype) for i in self.fields) - ) - - @property - def planning_complexity(self) -> int: - """ - Empirical heuristic measure of planning complexity. - - Used to determine when to decompose overly complex computations. May require tuning. - """ - return self.total_variables * self.total_relational_ops * (1 + self.total_joins) - - @abc.abstractmethod - def transform_children( - self: Self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> Self: - """Apply a function to each child node.""" - ... - - @property - def defines_namespace(self) -> bool: - """ - If true, this node establishes a new column id namespace. - - If false, this node consumes and produces ids in the namespace - """ - return False - - def get_type(self, id: identifiers.ColumnId) -> bigframes.dtypes.Dtype: - return self._dtype_lookup[id] - - @functools.cached_property - def _dtype_lookup(self): - return {field.id: field.dtype for field in self.fields} - class AdditiveNode: """Definition of additive - if you drop added_fields, you end up with the descendent. 
@@ -455,7 +216,7 @@ def explicitly_ordered(self) -> bool: @property def added_fields(self) -> Tuple[Field, ...]: - return (Field(self.indicator_col, bigframes.dtypes.BOOL_DTYPE),) + return (Field(self.indicator_col, bigframes.dtypes.BOOL_DTYPE, nullable=False),) @property def fields(self) -> Iterable[Field]: @@ -489,6 +250,12 @@ def referenced_ids(self) -> COLUMN_SET: def additive_base(self) -> BigFrameNode: return self.left_child + @property + def joins_nulls(self) -> bool: + left_nullable = self.left_child.field_by_id[self.left_col.id].nullable + right_nullable = self.right_child.field_by_id[self.right_col.id].nullable + return left_nullable or right_nullable + def replace_additive_base(self, node: BigFrameNode): return dataclasses.replace(self, left_child=node) @@ -549,7 +316,23 @@ def explicitly_ordered(self) -> bool: @property def fields(self) -> Iterable[Field]: - return itertools.chain(self.left_child.fields, self.right_child.fields) + left_fields = self.left_child.fields + if self.type in ("right", "outer"): + left_fields = map(lambda x: x.with_nullable(), left_fields) + right_fields = self.right_child.fields + if self.type in ("left", "outer"): + right_fields = map(lambda x: x.with_nullable(), right_fields) + return itertools.chain(left_fields, right_fields) + + @property + def joins_nulls(self) -> bool: + for left_ref, right_ref in self.conditions: + if ( + self.left_child.field_by_id[left_ref.id].nullable + and self.right_child.field_by_id[right_ref.id].nullable + ): + return True + return False @functools.cached_property def variables_introduced(self) -> int: @@ -642,6 +425,7 @@ def explicitly_ordered(self) -> bool: @property def fields(self) -> Iterable[Field]: # TODO: Output names should probably be aligned beforehand or be part of concat definition + # TODO: Handle nullability return ( Field(id, field.dtype) for id, field in zip(self.output_ids, self.children[0].fields) @@ -715,7 +499,9 @@ def explicitly_ordered(self) -> bool: @functools.cached_property def fields(self) -> Iterable[Field]: - return (Field(self.output_id, next(iter(self.start.fields)).dtype),) + return ( + Field(self.output_id, next(iter(self.start.fields)).dtype, nullable=False), + ) @functools.cached_property def variables_introduced(self) -> int: @@ -794,13 +580,14 @@ class ScanList: @dataclasses.dataclass(frozen=True, eq=False) class ReadLocalNode(LeafNode): # TODO: Combine feather_bytes, data_schema, n_rows into a LocalDataDef struct + # TODO: Track nullability for local data feather_bytes: bytes data_schema: schemata.ArraySchema n_rows: int # Mapping of local ids to bfet id. 
scan_list: ScanList # Offsets are generated only if this is non-null - offsets_col: Optional[bigframes.core.identifiers.ColumnId] = None + offsets_col: Optional[identifiers.ColumnId] = None session: typing.Optional[bigframes.session.Session] = None @property @@ -808,7 +595,8 @@ def fields(self) -> Iterable[Field]: fields = (Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) if self.offsets_col is not None: return itertools.chain( - fields, (Field(self.offsets_col, bigframes.dtypes.INT_DTYPE),) + fields, + (Field(self.offsets_col, bigframes.dtypes.INT_DTYPE, nullable=False),), ) return fields @@ -894,6 +682,11 @@ def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqTable: else tuple(table.clustering_fields), ) + @property + @functools.cache + def schema_by_id(self): + return {col.name: col for col in self.physical_schema} + @dataclasses.dataclass(frozen=True) class BigqueryDataSource: @@ -936,7 +729,10 @@ def session(self): @property def fields(self) -> Iterable[Field]: - return (Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) + return ( + Field(col_id, dtype, self.source.table.schema_by_id[source_id].is_nullable) + for col_id, dtype, source_id in self.scan_list.items + ) @property def relation_ops_created(self) -> int: @@ -1039,7 +835,7 @@ class CachedTableNode(ReadTableNode): # Unary nodes @dataclasses.dataclass(frozen=True, eq=False) class PromoteOffsetsNode(UnaryNode, AdditiveNode): - col_id: bigframes.core.identifiers.ColumnId + col_id: identifiers.ColumnId @property def non_local(self) -> bool: @@ -1047,9 +843,7 @@ def non_local(self) -> bool: @property def fields(self) -> Iterable[Field]: - return itertools.chain( - self.child.fields, [Field(self.col_id, bigframes.dtypes.INT_DTYPE)] - ) + return itertools.chain(self.child.fields, self.added_fields) @property def relation_ops_created(self) -> int: @@ -1073,7 +867,7 @@ def referenced_ids(self) -> COLUMN_SET: @property def added_fields(self) -> Tuple[Field, ...]: - return (Field(self.col_id, bigframes.dtypes.INT_DTYPE),) + return (Field(self.col_id, bigframes.dtypes.INT_DTYPE, nullable=False),) @property def additive_base(self) -> BigFrameNode: @@ -1095,6 +889,7 @@ def remap_refs( @dataclasses.dataclass(frozen=True, eq=False) class FilterNode(UnaryNode): + # TODO: Infer null constraints from predicate predicate: ex.Expression @property @@ -1264,8 +1059,13 @@ def _validate(self): @functools.cached_property def fields(self) -> Iterable[Field]: + input_fields_by_id = {field.id: field for field in self.child.fields} return tuple( - Field(output, self.child.get_type(ref.id)) + Field( + output, + input_fields_by_id[ref.id].dtype, + input_fields_by_id[ref.id].nullable, + ) for ref, output in self.input_output_pairs ) @@ -1317,9 +1117,7 @@ def remap_refs( class ProjectionNode(UnaryNode, AdditiveNode): """Assigns new variables (without modifying existing ones)""" - assignments: typing.Tuple[ - typing.Tuple[ex.Expression, bigframes.core.identifiers.ColumnId], ... - ] + assignments: typing.Tuple[typing.Tuple[ex.Expression, identifiers.ColumnId], ...] 
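# [Editorial aside, not part of the patch] Where the new nullability plumbing
# gets its ground truth: GbqTable.schema_by_id (added above) indexes the
# physical BigQuery schema by column name, and ReadTableNode now marks a Field
# non-nullable when the underlying column mode is REQUIRED. A minimal sketch
# with illustrative column names, using public google-cloud-bigquery types:
import google.cloud.bigquery as _bq

_physical_schema = (
    _bq.SchemaField("order_id", "INT64", mode="REQUIRED"),
    _bq.SchemaField("note", "STRING", mode="NULLABLE"),
)
_schema_by_id = {col.name: col for col in _physical_schema}
assert _schema_by_id["order_id"].is_nullable is False   # -> Field(..., nullable=False)
assert _schema_by_id["note"].is_nullable is True        # stays best-effort nullable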
def _validate(self): input_types = self.child._dtype_lookup @@ -1332,10 +1130,22 @@ def _validate(self): @functools.cached_property def added_fields(self) -> Tuple[Field, ...]: input_types = self.child._dtype_lookup - return tuple( - Field(id, bigframes.dtypes.dtype_for_etype(ex.output_type(input_types))) - for ex, id in self.assignments - ) + + fields = [] + for expr, id in self.assignments: + field = Field( + id, + bigframes.dtypes.dtype_for_etype(expr.output_type(input_types)), + nullable=expr.nullable, + ) + # Special case until we get better nullability inference in expression objects themselves + if expr.is_identity and not any( + self.child.field_by_id[id].nullable for id in expr.column_references + ): + field = field.with_nonnull() + fields.append(field) + + return tuple(fields) @property def fields(self) -> Iterable[Field]: @@ -1410,7 +1220,7 @@ def non_local(self) -> bool: @property def fields(self) -> Iterable[Field]: - return (Field(self.col_id, bigframes.dtypes.INT_DTYPE),) + return (Field(self.col_id, bigframes.dtypes.INT_DTYPE, nullable=False),) @property def variables_introduced(self) -> int: @@ -1445,9 +1255,7 @@ def remap_refs( @dataclasses.dataclass(frozen=True, eq=False) class AggregateNode(UnaryNode): - aggregations: typing.Tuple[ - typing.Tuple[ex.Aggregation, bigframes.core.identifiers.ColumnId], ... - ] + aggregations: typing.Tuple[typing.Tuple[ex.Aggregation, identifiers.ColumnId], ...] by_column_ids: typing.Tuple[ex.DerefOp, ...] = tuple([]) order_by: Tuple[OrderingExpression, ...] = () dropna: bool = True @@ -1462,19 +1270,22 @@ def non_local(self) -> bool: @functools.cached_property def fields(self) -> Iterable[Field]: - by_items = ( - Field(ref.id, self.child.get_type(ref.id)) for ref in self.by_column_ids - ) + # TODO: Use child nullability to infer grouping key nullability + by_fields = (self.child.field_by_id[ref.id] for ref in self.by_column_ids) + if self.dropna: + by_fields = (field.with_nonnull() for field in by_fields) + # TODO: Label aggregate ops to determine which are guaranteed non-null agg_items = ( Field( id, bigframes.dtypes.dtype_for_etype( agg.output_type(self.child._dtype_lookup) ), + nullable=True, ) for agg, id in self.aggregations ) - return tuple(itertools.chain(by_items, agg_items)) + return tuple(itertools.chain(by_fields, agg_items)) @property def variables_introduced(self) -> int: @@ -1539,7 +1350,7 @@ def remap_refs( class WindowOpNode(UnaryNode, AdditiveNode): expression: ex.Aggregation window_spec: window.WindowSpec - output_name: bigframes.core.identifiers.ColumnId + output_name: identifiers.ColumnId never_skip_nulls: bool = False skip_reproject_unsafe: bool = False @@ -1579,6 +1390,7 @@ def row_count(self) -> Optional[int]: @functools.cached_property def added_field(self) -> Field: input_types = self.child._dtype_lookup + # TODO: Determine if output could be non-null return Field( self.output_name, bigframes.dtypes.dtype_for_etype(self.expression.output_type(input_types)), @@ -1682,7 +1494,7 @@ def remap_refs( class ExplodeNode(UnaryNode): column_ids: typing.Tuple[ex.DerefOp, ...] 
# Offsets are generated only if this is non-null - offsets_col: Optional[bigframes.core.identifiers.ColumnId] = None + offsets_col: Optional[identifiers.ColumnId] = None @property def row_preserving(self) -> bool: @@ -1696,6 +1508,7 @@ def fields(self) -> Iterable[Field]: bigframes.dtypes.arrow_dtype_to_bigframes_dtype( self.child.get_type(field.id).pyarrow_dtype.value_type # type: ignore ), + nullable=True, ) if field.id in set(map(lambda x: x.id, self.column_ids)) else field @@ -1703,7 +1516,8 @@ def fields(self) -> Iterable[Field]: ) if self.offsets_col is not None: return itertools.chain( - fields, (Field(self.offsets_col, bigframes.dtypes.INT_DTYPE),) + fields, + (Field(self.offsets_col, bigframes.dtypes.INT_DTYPE, nullable=False),), ) return fields @@ -1745,50 +1559,24 @@ def remap_refs( def top_down( root: BigFrameNode, transform: Callable[[BigFrameNode], BigFrameNode], - *, - memoize=False, - validate=False, ) -> BigFrameNode: """ Perform a top-down transformation of the BigFrameNode tree. - - If memoize=True, recursive calls are memoized within the scope of the traversal only. """ - - def top_down_internal(root: BigFrameNode) -> BigFrameNode: - return transform(root).transform_children(top_down_internal) - - if memoize: - # MUST reassign to the same name or caching won't work recursively - top_down_internal = functools.cache(top_down_internal) - - result = top_down_internal(root) - if validate: - result.validate_tree() - return result + return root.top_down(transform) def bottom_up( root: BigFrameNode, transform: Callable[[BigFrameNode], BigFrameNode], - *, - memoize=False, - validate=False, ) -> BigFrameNode: """ Perform a bottom-up transformation of the BigFrameNode tree. - If memoize=True, recursive calls are memoized within the scope of the traversal only. - """ - - def bottom_up_internal(root: BigFrameNode) -> BigFrameNode: - return transform(root.transform_children(bottom_up_internal)) + The `transform` function is applied to each node *after* its children + have been transformed. This allows for transformations that depend + on the results of transforming subtrees. - if memoize: - # MUST reassign to the same name or caching won't work recursively - bottom_up_internal = functools.cache(bottom_up_internal) - - result = bottom_up_internal(root) - if validate: - result.validate_tree() - return result + Returns the transformed root node. 
+ """ + return root.bottom_up(transform) diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py index 7e40137f3e..5a94f2aa40 100644 --- a/bigframes/core/rewrite/pruning.py +++ b/bigframes/core/rewrite/pruning.py @@ -79,12 +79,17 @@ def prune_selection_child( elif isinstance(child, bigframes.core.nodes.AdditiveNode): if not set(field.id for field in child.added_fields) & selection.consumed_ids: return selection.replace_child(child.additive_base) - return selection.replace_child( - child.replace_additive_base( - prune_node( - child.additive_base, selection.consumed_ids | child.referenced_ids - ) + needed_ids = selection.consumed_ids | child.referenced_ids + if isinstance(child, bigframes.core.nodes.ProjectionNode): + # Projection expressions are independent, so can be individually removed from the node + child = dataclasses.replace( + child, + assignments=tuple( + (ex, id) for (ex, id) in child.assignments if id in needed_ids + ), ) + return selection.replace_child( + child.replace_additive_base(prune_node(child.additive_base, needed_ids)) ) elif isinstance(child, bigframes.core.nodes.ConcatNode): indices = [ diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index d740b28d7d..9d52eae77d 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import dataclasses import functools import typing @@ -27,6 +29,14 @@ class _TypedExpr: expr: ex.Expression dtype: dtypes.Dtype + @classmethod + def create_op_expr( + cls, op: typing.Union[ops.ScalarOp, ops.RowOp], *inputs: _TypedExpr + ) -> _TypedExpr: + expr = op.as_expr(*tuple(x.expr for x in inputs)) # type: ignore + dtype = op.output_type(*tuple(x.dtype for x in inputs)) + return cls(expr, dtype) + def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNode: """ @@ -38,12 +48,27 @@ def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNod (_rewrite_expressions(expr, root.schema).expr, column_id) for expr, column_id in root.assignments ) - root = nodes.ProjectionNode(root.child, updated_assignments) + return nodes.ProjectionNode(root.child, updated_assignments) + + if isinstance(root, nodes.FilterNode): + return nodes.FilterNode( + root.child, _rewrite_expressions(root.predicate, root.schema).expr + ) + + if isinstance(root, nodes.OrderByNode): + by = tuple(_rewrite_ordering_expr(x, root.schema) for x in root.by) + return nodes.OrderByNode(root.child, by) - # TODO(b/394354614): FilterByNode and OrderNode also contain expressions. Need to update them too. 
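The pruning change above relies on projection assignments being independent of one another, so any assignment whose output id is not consumed downstream can be dropped before recursing into the base node. A reduced sketch of that filter step, with plain tuples standing in for expression objects and ColumnIds:

from typing import Set, Tuple

Assignment = Tuple[str, str]  # (expression, output column id), simplified


def prune_assignments(
    assignments: Tuple[Assignment, ...], needed_ids: Set[str]
) -> Tuple[Assignment, ...]:
    # Each projection expression is independent, so unneeded ones can be removed individually.
    return tuple((expr, out_id) for expr, out_id in assignments if out_id in needed_ids)


kept = prune_assignments((("a + 1", "x"), ("b * 2", "y")), {"x"})
assert kept == (("a + 1", "x"),)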
return root +def _rewrite_ordering_expr( + expr: nodes.OrderingExpression, schema: schema.ArraySchema +) -> nodes.OrderingExpression: + by = _rewrite_expressions(expr.scalar_expression, schema).expr + return nodes.OrderingExpression(by, expr.direction, expr.na_last) + + @functools.cache def _rewrite_expressions(expr: ex.Expression, schema: schema.ArraySchema) -> _TypedExpr: if isinstance(expr, ex.DerefOp): @@ -78,37 +103,66 @@ def _rewrite_op_expr( if isinstance(expr.op, ops.AddOp): return _rewrite_add_op(inputs[0], inputs[1]) - input_types = tuple(map(lambda x: x.dtype, inputs)) - return _TypedExpr(expr, expr.op.output_type(*input_types)) + if isinstance(expr.op, ops.MulOp): + return _rewrite_mul_op(inputs[0], inputs[1]) + + if isinstance(expr.op, ops.DivOp): + return _rewrite_div_op(inputs[0], inputs[1]) + + if isinstance(expr.op, ops.FloorDivOp): + # We need to re-write floor div because for numerics: int // float => float + # but for timedeltas: int(timedelta) // float => int(timedelta) + return _rewrite_floordiv_op(inputs[0], inputs[1]) + + return _TypedExpr.create_op_expr(expr.op, *inputs) def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: - result_op: ops.BinaryOp = ops.sub_op if dtypes.is_datetime_like(left.dtype) and dtypes.is_datetime_like(right.dtype): - result_op = ops.timestamp_diff_op + return _TypedExpr.create_op_expr(ops.timestamp_diff_op, left, right) - return _TypedExpr( - result_op.as_expr(left.expr, right.expr), - result_op.output_type(left.dtype, right.dtype), - ) + if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: + return _TypedExpr.create_op_expr(ops.timestamp_sub_op, left, right) + + return _TypedExpr.create_op_expr(ops.sub_op, left, right) def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: - return _TypedExpr( - ops.timestamp_add_op.as_expr(left.expr, right.expr), - ops.timestamp_add_op.output_type(left.dtype, right.dtype), - ) + return _TypedExpr.create_op_expr(ops.timestamp_add_op, left, right) if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right.dtype): # Re-arrange operands such that timestamp is always on the left and timedelta is # always on the right. 
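These rewrites back the user-level arithmetic: subtracting two datetime columns yields a timedelta, and adding or subtracting a timedelta shifts a datetime. A hedged usage sketch, assuming an authenticated BigQuery DataFrames session with default project settings:

import pandas as pd

import bigframes.pandas as bpd

df = bpd.DataFrame(
    {
        "start": [pd.Timestamp("2025-01-01 00:00:00")],
        "end": [pd.Timestamp("2025-01-01 01:30:00")],
    }
)

elapsed = df["end"] - df["start"]            # datetime - datetime -> timedelta
later = df["start"] + pd.Timedelta(2, "h")   # datetime + timedelta -> datetime
earlier = df["end"] - pd.Timedelta(30, "m")  # datetime - timedelta -> datetime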
- return _TypedExpr( - ops.timestamp_add_op.as_expr(right.expr, left.expr), - ops.timestamp_add_op.output_type(right.dtype, left.dtype), - ) + return _TypedExpr.create_op_expr(ops.timestamp_add_op, right, left) + + return _TypedExpr.create_op_expr(ops.add_op, left, right) + + +def _rewrite_mul_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: + result = _TypedExpr.create_op_expr(ops.mul_op, left, right) + + if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): + return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + if dtypes.is_numeric(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: + return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + + return result + + +def _rewrite_div_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: + result = _TypedExpr.create_op_expr(ops.div_op, left, right) + + if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): + return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) + + return result + + +def _rewrite_floordiv_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: + result = _TypedExpr.create_op_expr(ops.floordiv_op, left, right) + + if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): + return _TypedExpr.create_op_expr(ops.ToTimedeltaOp("us"), result) - return _TypedExpr( - ops.add_op.as_expr(left.expr, right.expr), - ops.add_op.output_type(left.dtype, right.dtype), - ) + return result diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index d893356207..82df53af82 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -88,7 +88,7 @@ def select_cache_target( @functools.cache def _with_caching(subtree: nodes.BigFrameNode) -> nodes.BigFrameNode: - return nodes.top_down(subtree, lambda x: cache.get(x, x), memoize=True) + return nodes.top_down(subtree, lambda x: cache.get(x, x)) def _combine_counts( left: Dict[nodes.BigFrameNode, int], right: Dict[nodes.BigFrameNode, int] diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 0198f12537..502a40d92d 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -24,6 +24,7 @@ import pandas.api.types as pdtypes import typing_extensions +import bigframes.dtypes as dtypes import bigframes.exceptions as bfe UNNAMED_COLUMN_ID = "bigframes_unnamed_column" @@ -226,3 +227,24 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]: updated_columns.append(dataframe.index.name) return updated_columns + + +def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]: + """ + Due to a BigQuery IO limitation with loading JSON from Parquet files (b/374784249), + we're using a workaround: storing JSON as strings and then parsing them into JSON + objects. + TODO(b/395912450): Remove workaround solution once b/374784249 got resolved. 
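The mul/div/floordiv rewrites above hinge on timedeltas being represented as integer microseconds: a numeric result has to be wrapped back into a timedelta (hence ToTimedeltaOp("us")), and floor division keeps integer semantics even when the other operand is a float. A plain-pandas sketch of that bookkeeping, not the bigframes expression machinery:

import pandas as pd


def timedelta_mul(td: pd.Timedelta, factor: float) -> pd.Timedelta:
    micros = td // pd.Timedelta(1, "us")  # timedelta viewed as integer microseconds
    return pd.Timedelta(int(micros * factor), "us")  # wrap the numeric result back up


def timedelta_floordiv(td: pd.Timedelta, divisor: float) -> pd.Timedelta:
    micros = td // pd.Timedelta(1, "us")
    # For plain numerics int // float gives a float; timedeltas stay integer micros.
    return pd.Timedelta(int(micros // divisor), "us")


assert timedelta_mul(pd.Timedelta(5, "s"), 1.5) == pd.Timedelta(7.5, "s")
assert timedelta_floordiv(pd.Timedelta(5, "s"), 2.0) == pd.Timedelta(2.5, "s")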
+ """ + updated_columns = [] + + for col in dataframe.columns: + if dataframe[col].dtype == dtypes.JSON_DTYPE: + dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE) + updated_columns.append(col) + + if dataframe.index.dtype == dtypes.JSON_DTYPE: + dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE) + updated_columns.append(dataframe.index.name) + + return updated_columns diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4ffa56c2e5..c02b182ee3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -103,6 +103,8 @@ class DataFrame(vendored_pandas_frame.DataFrame): __doc__ = vendored_pandas_frame.DataFrame.__doc__ # internal flag to disable cache at all _disable_cache_override: bool = False + # Must be above 5000 for pandas to delegate to bigframes for binops + __pandas_priority__ = 15000 def __init__( self, diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index eed45e1dde..e4db904210 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -301,6 +301,7 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool: return type_ in ("object", "O") or ( getattr(type_, "kind", None) == "O" and getattr(type_, "storage", None) != "pyarrow" + and getattr(type_, "name", None) != "dbjson" ) @@ -357,7 +358,7 @@ def is_comparable(type_: ExpressionType) -> bool: def is_orderable(type_: ExpressionType) -> bool: # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable - return type_ in _ORDERABLE_SIMPLE_TYPES + return type_ in _ORDERABLE_SIMPLE_TYPES or type_ is TIMEDELTA_DTYPE _CLUSTERABLE_SIMPLE_TYPES = set( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 88406317fe..f2bc1ecf85 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -115,15 +115,18 @@ cos_op, cosh_op, div_op, + DivOp, exp_op, expm1_op, floor_op, floordiv_op, + FloorDivOp, ln_op, log1p_op, log10_op, mod_op, mul_op, + MulOp, neg_op, pos_op, pow_op, @@ -178,7 +181,11 @@ ) from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op -from bigframes.operations.timedelta_ops import timestamp_add_op, ToTimedeltaOp +from bigframes.operations.timedelta_ops import ( + timestamp_add_op, + timestamp_sub_op, + ToTimedeltaOp, +) __all__ = [ # Base ops @@ -251,6 +258,7 @@ "normalize_op", # Timedelta ops "timestamp_add_op", + "timestamp_sub_op", "ToTimedeltaOp", # Datetime ops "date_op", @@ -277,15 +285,18 @@ "cos_op", "cosh_op", "div_op", + "DivOp", "exp_op", "expm1_op", "floor_op", "floordiv_op", + "FloorDivOp", "ln_op", "log1p_op", "log10_op", "mod_op", "mul_op", + "MulOp", "neg_op", "pos_op", "pow_op", diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 7fa4dd9633..e97515d42b 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -276,7 +276,7 @@ def image_blur( self, ksize: tuple[int, int], *, - dst: Union[str, bigframes.series.Series], + dst: Optional[Union[str, bigframes.series.Series]] = None, connection: Optional[str] = None, ) -> bigframes.series.Series: """Blurs images. @@ -286,7 +286,7 @@ def image_blur( Args: ksize (tuple(int, int)): Kernel size. - dst (str or bigframes.series.Series): Destination GCS folder str or blob series. + dst (str or bigframes.series.Series or None, default None): Destination GCS folder str or blob series. If None, output to BQ as bytes. 
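The __pandas_priority__ attributes added to DataFrame and Series in this change use a pandas 2.1+ hook: when a pandas object sees an operand whose __pandas_priority__ is higher than its own, it returns NotImplemented and Python dispatches to that operand's reflected method. A toy illustration of the mechanism (not the bigframes implementation; assumes pandas >= 2.1):

import pandas as pd


class Wrapper:
    # High enough that pandas defers to this object in mixed binops.
    __pandas_priority__ = 15000

    def __init__(self, values):
        self.values = list(values)

    def __radd__(self, other):
        # Reached because pd.Series.__add__ returned NotImplemented.
        return Wrapper([a + b for a, b in zip(other.tolist(), self.values)])


result = pd.Series([1, 2, 3]) + Wrapper([10, 20, 30])
print(type(result), result.values)  # <class '__main__.Wrapper'> [11, 22, 33]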
connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. Returns: @@ -295,6 +295,19 @@ def image_blur( import bigframes.blob._functions as blob_func connection = self._resolve_connection(connection) + df = self._get_runtime_json_str(mode="R").to_frame() + + if dst is None: + image_blur_udf = blob_func.TransformFunction( + blob_func.image_blur_to_bytes_def, + session=self._block.session, + connection=connection, + ).udf() + + df["ksize_x"], df["ksize_y"] = ksize + res = df.apply(image_blur_udf, axis=1) + + return res if isinstance(dst, str): dst = os.path.join(dst, "") @@ -311,10 +324,9 @@ def image_blur( connection=connection, ).udf() - src_rt = self._get_runtime_json_str(mode="R") dst_rt = dst.blob._get_runtime_json_str(mode="RW") - df = src_rt.to_frame().join(dst_rt.to_frame(), how="outer") + df = df.join(dst_rt, how="outer") df["ksize_x"], df["ksize_y"] = ksize res = df.apply(image_blur_udf, axis=1) @@ -322,6 +334,151 @@ def image_blur( return dst + def image_resize( + self, + dsize: tuple[int, int] = (0, 0), + *, + fx: float = 0.0, + fy: float = 0.0, + dst: Optional[Union[str, bigframes.series.Series]] = None, + connection: Optional[str] = None, + ): + """Resize images. + + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + + Args: + dsize (tuple(int, int), default (0, 0)): Destination size. If set to 0, fx and fy parameters determine the size. + fx (float, default 0.0): scale factor along the horizontal axis. If set to 0.0, dsize parameter determines the output size. + fy (float, defalut 0.0): scale factor along the vertical axis. If set to 0.0, dsize parameter determines the output size. + dst (str or bigframes.series.Series or None, default None): Destination GCS folder str or blob series. If None, output to BQ as bytes. + connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. + + Returns: + BigFrames Blob Series + """ + dsize_set = dsize[0] > 0 and dsize[1] > 0 + fsize_set = fx > 0.0 and fy > 0.0 + if not dsize_set ^ fsize_set: + raise ValueError( + "Only one of dsize or (fx, fy) parameters must be set. And the set values must be positive. " + ) + + import bigframes.blob._functions as blob_func + + connection = self._resolve_connection(connection) + df = self._get_runtime_json_str(mode="R").to_frame() + + if dst is None: + image_resize_udf = blob_func.TransformFunction( + blob_func.image_resize_to_bytes_def, + session=self._block.session, + connection=connection, + ).udf() + + df["dsize_x"], df["dsizye_y"] = dsize + df["fx"], df["fy"] = fx, fy + res = df.apply(image_resize_udf, axis=1) + + return res + + if isinstance(dst, str): + dst = os.path.join(dst, "") + src_uri = bigframes.series.Series(self._block).struct.explode()["uri"] + # Replace src folder with dst folder, keep the file names. 
+ dst_uri = src_uri.str.replace(r"^.*\/(.*)$", rf"{dst}\1", regex=True) + dst = cast( + bigframes.series.Series, dst_uri.str.to_blob(connection=connection) + ) + + image_resize_udf = blob_func.TransformFunction( + blob_func.image_resize_def, + session=self._block.session, + connection=connection, + ).udf() + + dst_rt = dst.blob._get_runtime_json_str(mode="RW") + + df = df.join(dst_rt, how="outer") + df["dsize_x"], df["dsizye_y"] = dsize + df["fx"], df["fy"] = fx, fy + + res = df.apply(image_resize_udf, axis=1) + res.cache() # to execute the udf + + return dst + + def image_normalize( + self, + *, + alpha: float = 1.0, + beta: float = 0.0, + norm_type: str = "l2", + dst: Optional[Union[str, bigframes.series.Series]] = None, + connection: Optional[str] = None, + ) -> bigframes.series.Series: + """Normalize images. + + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + + Args: + alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization. + beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization. + norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax". + dst (str or bigframes.series.Series or None, default None): Destination GCS folder str or blob series. If None, output to BQ as bytes. + connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. + + Returns: + BigFrames Blob Series + """ + import bigframes.blob._functions as blob_func + + connection = self._resolve_connection(connection) + df = self._get_runtime_json_str(mode="R").to_frame() + + if dst is None: + image_normalize_udf = blob_func.TransformFunction( + blob_func.image_normalize_to_bytes_def, + session=self._block.session, + connection=connection, + ).udf() + + df["alpha"] = alpha + df["beta"] = beta + df["norm_type"] = norm_type + res = df.apply(image_normalize_udf, axis=1) + + return res + + if isinstance(dst, str): + dst = os.path.join(dst, "") + src_uri = bigframes.series.Series(self._block).struct.explode()["uri"] + # Replace src folder with dst folder, keep the file names. 
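On the user side, the blob methods above now have two output modes: with dst omitted the transformed images come back as bytes, and with a GCS folder or blob series they are written back to Cloud Storage. A hedged usage sketch; the bucket paths and connection name are placeholders, and the blob API is still experimental:

import bigframes
import bigframes.pandas as bpd

bigframes.options.experiments.blob = True

uris = bpd.Series(["gs://my-bucket/images/img0.jpg"])   # hypothetical object path
imgs = uris.str.to_blob(connection="us.my-connection")  # hypothetical connection

blurred_bytes = imgs.blob.image_blur((8, 8))                           # bytes, no dst
resized = imgs.blob.image_resize((64, 64), dst="gs://my-bucket/out/")  # written to GCS
normalized = imgs.blob.image_normalize(alpha=0.0, beta=255.0, norm_type="minmax")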
+ dst_uri = src_uri.str.replace(r"^.*\/(.*)$", rf"{dst}\1", regex=True) + dst = cast( + bigframes.series.Series, dst_uri.str.to_blob(connection=connection) + ) + + image_normalize_udf = blob_func.TransformFunction( + blob_func.image_normalize_def, + session=self._block.session, + connection=connection, + ).udf() + + dst_rt = dst.blob._get_runtime_json_str(mode="RW") + + df = df.join(dst_rt, how="outer") + df["alpha"] = alpha + df["beta"] = beta + df["norm_type"] = norm_type + + res = df.apply(image_normalize_udf, axis=1) + res.cache() # to execute the udf + + return dst + def pdf_extract( self, *, connection: Optional[str] = None ) -> bigframes.series.Series: diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 5183e5c4c5..f5a290bde5 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -75,11 +75,17 @@ name="ceil", type_signature=op_typing.UNARY_REAL_NUMERIC ) -abs_op = base_ops.create_unary_op(name="abs", type_signature=op_typing.UNARY_NUMERIC) +abs_op = base_ops.create_unary_op( + name="abs", type_signature=op_typing.UNARY_NUMERIC_AND_TIMEDELTA +) -pos_op = base_ops.create_unary_op(name="pos", type_signature=op_typing.UNARY_NUMERIC) +pos_op = base_ops.create_unary_op( + name="pos", type_signature=op_typing.UNARY_NUMERIC_AND_TIMEDELTA +) -neg_op = base_ops.create_unary_op(name="neg", type_signature=op_typing.UNARY_NUMERIC) +neg_op = base_ops.create_unary_op( + name="neg", type_signature=op_typing.UNARY_NUMERIC_AND_TIMEDELTA +) exp_op = base_ops.create_unary_op( name="exp", type_signature=op_typing.UNARY_REAL_NUMERIC @@ -123,6 +129,9 @@ def output_type(self, *input_types): if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type): return right_type + if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + if (left_type is None or dtypes.is_numeric(left_type)) and ( right_type is None or dtypes.is_numeric(right_type) ): @@ -142,29 +151,102 @@ class SubOp(base_ops.BinaryOp): def output_type(self, *input_types): left_type = input_types[0] right_type = input_types[1] + + if dtypes.is_datetime_like(left_type) and dtypes.is_datetime_like(right_type): + return dtypes.TIMEDELTA_DTYPE + + if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: + return left_type + + if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + if (left_type is None or dtypes.is_numeric(left_type)) and ( right_type is None or dtypes.is_numeric(right_type) ): # Numeric subtraction return dtypes.coerce_to_common(left_type, right_type) - if dtypes.is_datetime_like(left_type) and dtypes.is_datetime_like(right_type): - return dtypes.TIMEDELTA_DTYPE - raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") sub_op = SubOp() -mul_op = base_ops.create_binary_op(name="mul", type_signature=op_typing.BINARY_NUMERIC) -div_op = base_ops.create_binary_op( - name="div", type_signature=op_typing.BINARY_REAL_NUMERIC -) +@dataclasses.dataclass(frozen=True) +class MulOp(base_ops.BinaryOp): + name: typing.ClassVar[str] = "mul" -floordiv_op = base_ops.create_binary_op( - name="floordiv", type_signature=op_typing.BINARY_NUMERIC -) + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + left_type = input_types[0] + right_type = input_types[1] + + if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): + return dtypes.TIMEDELTA_DTYPE + if 
dtypes.is_numeric(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.TIMEDELTA_DTYPE + + if (left_type is None or dtypes.is_numeric(left_type)) and ( + right_type is None or dtypes.is_numeric(right_type) + ): + return dtypes.coerce_to_common(left_type, right_type) + + raise TypeError(f"Cannot multiply dtypes {left_type} and {right_type}") + + +mul_op = MulOp() + + +@dataclasses.dataclass(frozen=True) +class DivOp(base_ops.BinaryOp): + name: typing.ClassVar[str] = "div" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + left_type = input_types[0] + right_type = input_types[1] + + if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): + return dtypes.TIMEDELTA_DTYPE + + if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.FLOAT_DTYPE + + if (left_type is None or dtypes.is_numeric(left_type)) and ( + right_type is None or dtypes.is_numeric(right_type) + ): + lcd_type = dtypes.coerce_to_common(left_type, right_type) + # Real numeric ops produce floats on int input + return dtypes.FLOAT_DTYPE if lcd_type == dtypes.INT_DTYPE else lcd_type + + raise TypeError(f"Cannot divide dtypes {left_type} and {right_type}") + + +div_op = DivOp() + + +@dataclasses.dataclass(frozen=True) +class FloorDivOp(base_ops.BinaryOp): + name: typing.ClassVar[str] = "floordiv" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + left_type = input_types[0] + right_type = input_types[1] + + if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): + return dtypes.TIMEDELTA_DTYPE + + if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE: + return dtypes.INT_DTYPE + + if (left_type is None or dtypes.is_numeric(left_type)) and ( + right_type is None or dtypes.is_numeric(right_type) + ): + return dtypes.coerce_to_common(left_type, right_type) + + raise TypeError(f"Cannot floor divide dtypes {left_type} and {right_type}") + + +floordiv_op = FloorDivOp() pow_op = base_ops.create_binary_op(name="pow", type_signature=op_typing.BINARY_NUMERIC) diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 69e054fa5c..689966e21b 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -26,7 +26,11 @@ class ToTimedeltaOp(base_ops.UnaryOp): unit: typing.Literal["us", "ms", "s", "m", "h", "d", "W"] def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if input_types[0] in (dtypes.INT_DTYPE, dtypes.FLOAT_DTYPE): + if input_types[0] in ( + dtypes.INT_DTYPE, + dtypes.FLOAT_DTYPE, + dtypes.TIMEDELTA_DTYPE, + ): return dtypes.TIMEDELTA_DTYPE raise TypeError("expected integer or float input") @@ -54,3 +58,22 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT timestamp_add_op = TimestampAdd() + + +class TimestampSub(base_ops.BinaryOp): + name: typing.ClassVar[str] = "timestamp_sub" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + # timestamp - timedelta => timestamp + if ( + dtypes.is_datetime_like(input_types[0]) + and input_types[1] is dtypes.TIMEDELTA_DTYPE + ): + return input_types[0] + + raise TypeError( + f"unsupported types for timestamp_sub. 
left: {input_types[0]} right: {input_types[1]}" + ) + + +timestamp_sub_op = TimestampSub() diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index 86bb56fc39..0a47cd91f0 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -224,6 +224,10 @@ def output_type( # Common type signatures UNARY_NUMERIC = TypePreserving(bigframes.dtypes.is_numeric, description="numeric") +UNARY_NUMERIC_AND_TIMEDELTA = TypePreserving( + lambda x: bigframes.dtypes.is_numeric(x) or x is bigframes.dtypes.TIMEDELTA_DTYPE, + description="numeric_and_timedelta", +) UNARY_REAL_NUMERIC = UnaryRealNumeric() BINARY_NUMERIC = BinaryNumeric() BINARY_REAL_NUMERIC = BinaryRealNumeric() diff --git a/bigframes/series.py b/bigframes/series.py index af9fce6e20..fe2d1aae0e 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -78,6 +78,9 @@ @log_adapter.class_logger class Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Series): + # Must be above 5000 for pandas to delegate to bigframes for binops + __pandas_priority__ = 13000 + def __init__(self, *args, **kwargs): self._query_job: Optional[bigquery.QueryJob] = None super().__init__(*args, **kwargs) @@ -961,6 +964,9 @@ def update(self, other: Union[Series, Sequence, Mapping]) -> None: ) self._set_block(result._get_block()) + def __abs__(self) -> Series: + return self.abs() + def abs(self) -> Series: return self._apply_unary_op(ops.abs_op) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 532a909430..a1549238b3 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -14,7 +14,8 @@ from __future__ import annotations import dataclasses -from typing import Collection, List, Union +import typing +from typing import Collection, Union import bigframes_vendored.constants as constants import db_dtypes # type: ignore @@ -38,7 +39,7 @@ class DataFrameAndLabels: column_labels: Collection index_labels: Collection ordering_col: str - timedelta_cols: List[str] + col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] def _arrow_to_pandas_arrowdtype( @@ -165,11 +166,16 @@ def pandas_to_bq_compatible(pandas_dataframe: pandas.DataFrame) -> DataFrameAndL pandas_dataframe_copy[ordering_col] = np.arange(pandas_dataframe_copy.shape[0]) timedelta_cols = utils.replace_timedeltas_with_micros(pandas_dataframe_copy) + json_cols = utils.replace_json_with_string(pandas_dataframe_copy) + col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { + **{col: bigframes.dtypes.TIMEDELTA_DTYPE for col in timedelta_cols}, + **{col: bigframes.dtypes.JSON_DTYPE for col in json_cols}, + } return DataFrameAndLabels( df=pandas_dataframe_copy, column_labels=col_labels, index_labels=idx_labels, ordering_col=ordering_col, - timedelta_cols=timedelta_cols, + col_type_overrides=col_type_overrides, ) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 553c3fd6e6..502692929d 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -513,9 +513,7 @@ def _run_execute_query( raise def replace_cached_subtrees(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: - return nodes.top_down( - node, lambda x: self._cached_executions.get(x, x), memoize=True - ) + return nodes.top_down(node, lambda x: self._cached_executions.get(x, x)) def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): """ diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index b7550583e5..7204a14870 100644 --- 
a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -176,15 +176,11 @@ def read_pandas_load_job( self._start_generic_job(load_job) destination_table = self._bqclient.get_table(load_table_destination) - col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { - col: bigframes.dtypes.TIMEDELTA_DTYPE - for col in df_and_labels.timedelta_cols - } array_value = core.ArrayValue.from_table( table=destination_table, # TODO (b/394156190): Generate this directly from original pandas df. schema=schemata.ArraySchema.from_bq_table( - destination_table, col_type_overrides + destination_table, df_and_labels.col_type_overrides ), session=self._session, offsets_col=ordering_col, @@ -234,16 +230,11 @@ def read_pandas_streaming( raise ValueError( f"Problem loading at least one row from DataFrame: {errors}. {constants.FEEDBACK_LINK}" ) - - col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { - col: bigframes.dtypes.TIMEDELTA_DTYPE - for col in df_and_labels.timedelta_cols - } array_value = ( core.ArrayValue.from_table( table=destination_table, schema=schemata.ArraySchema.from_bq_table( - destination_table, col_type_overrides + destination_table, df_and_labels.col_type_overrides ), session=self._session, # Don't set the offsets column because we want to group by it. diff --git a/bigframes/version.py b/bigframes/version.py index e92072bea8..27dfb23603 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.36.0" +__version__ = "1.37.0" diff --git a/docs/reference/bigframes.geopandas/geoseries.rst b/docs/reference/bigframes.geopandas/geoseries.rst index 91e853b1f8..481eb73b9d 100644 --- a/docs/reference/bigframes.geopandas/geoseries.rst +++ b/docs/reference/bigframes.geopandas/geoseries.rst @@ -8,8 +8,8 @@ GeoSeries :local: :backlinks: none -Series ------- +GeoSeries +--------- .. autoclass:: bigframes.geopandas.GeoSeries :members: diff --git a/docs/reference/bigframes.pandas/frame.rst b/docs/reference/bigframes.pandas/frame.rst index d1610accdd..bc9f714416 100644 --- a/docs/reference/bigframes.pandas/frame.rst +++ b/docs/reference/bigframes.pandas/frame.rst @@ -3,6 +3,14 @@ DataFrame ========= +.. contents:: Table of Contents + :depth: 2 + :local: + :backlinks: none + +DataFrame +--------- + .. autoclass:: bigframes.dataframe.DataFrame :members: :inherited-members: @@ -18,3 +26,11 @@ Plotting handling :members: :inherited-members: :undoc-members: + +Struct handling +^^^^^^^^^^^^^^^ + +.. autoclass:: bigframes.operations.structs.StructFrameAccessor + :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.pandas/series.rst b/docs/reference/bigframes.pandas/series.rst index 30cf851de7..547b262591 100644 --- a/docs/reference/bigframes.pandas/series.rst +++ b/docs/reference/bigframes.pandas/series.rst @@ -46,7 +46,7 @@ List handling Struct handling ^^^^^^^^^^^^^^^ -.. automodule:: bigframes.operations.structs +.. 
autoclass:: bigframes.operations.structs.StructAccessor :members: :inherited-members: :undoc-members: diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index d57ab1c8ac..b4f513b11d 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -38,8 +38,13 @@ uid: bigframes.exceptions name: Core Objects - items: - - name: DataFrame - uid: bigframes.dataframe.DataFrame + - items: + - name: DataFrame + uid: bigframes.dataframe.DataFrame + - name: PlotAccessor + uid: bigframes.operations.plotting.PlotAccessor + - name: StructAccessor + uid: bigframes.operations.structs.StructFrameAccessor - items: - name: DataFrameGroupBy uid: bigframes.core.groupby.DataFrameGroupBy @@ -68,12 +73,6 @@ name: Indexers - name: pandas uid: bigframes.pandas - - items: - - name: Plotting - uid: bigframes.operations.plotting - - name: PlotAccessor - uid: bigframes.operations.plotting.PlotAccessor - name: Plotting - items: - name: Series uid: bigframes.series.Series @@ -83,6 +82,8 @@ uid: bigframes.operations.strings.StringMethods - name: StructAccessor uid: bigframes.operations.structs.StructAccessor + - name: PlotAccessor + uid: bigframes.operations.plotting.PlotAccessor name: Series - name: Window uid: bigframes.core.window.Window diff --git a/scripts/create_gcs.py b/scripts/create_gcs.py new file mode 100644 index 0000000000..8a94bfd886 --- /dev/null +++ b/scripts/create_gcs.py @@ -0,0 +1,96 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script create the bigtable resources required for +# bigframes.streaming testing if they don't already exist + +import os +from pathlib import Path +import sys + +import google.cloud.exceptions as exceptions +from google.cloud.storage import transfer_manager +import google.cloud.storage as gcs + +PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT") + +if not PROJECT_ID: + print( + "Please set GOOGLE_CLOUD_PROJECT environment variable before running.", + file=sys.stderr, + ) + sys.exit(1) + + +def create_bucket(client: gcs.Client) -> gcs.Bucket: + bucket_name = "bigframes_blob_test" + + print(f"Creating bucket: {bucket_name}") + try: + bucket = client.create_bucket(bucket_name) + print(f"Bucket {bucket_name} created. ") + + except exceptions.Conflict: + print(f"Bucket {bucket_name} already exists.") + bucket = client.bucket(bucket_name) + + return bucket + + +def upload_data(bucket: gcs.Bucket): + # from https://cloud.google.com/storage/docs/samples/storage-transfer-manager-upload-directory + source_directory = "scripts/data/" + workers = 8 + + # First, recursively get all files in `directory` as Path objects. + directory_as_path_obj = Path(source_directory) + paths = directory_as_path_obj.rglob("*") + + # Filter so the list only includes files, not directories themselves. + file_paths = [path for path in paths if path.is_file()] + + # These paths are relative to the current working directory. 
Next, make them + # relative to `directory` + relative_paths = [path.relative_to(source_directory) for path in file_paths] + + # Finally, convert them all to strings. + string_paths = [str(path) for path in relative_paths] + + print("Found {} files.".format(len(string_paths))) + + # Start the upload. + results = transfer_manager.upload_many_from_filenames( + bucket, string_paths, source_directory=source_directory, max_workers=workers + ) + + for name, result in zip(string_paths, results): + # The results list is either `None` or an exception for each filename in + # the input list, in order. + + if isinstance(result, Exception): + print("Failed to upload {} due to exception: {}".format(name, result)) + else: + print("Uploaded {} to {}.".format(name, bucket.name)) + + +def main(): + client = gcs.Client(project=PROJECT_ID) + + bucket = create_bucket(client) + + upload_data(bucket) + + +if __name__ == "__main__": + main() diff --git a/scripts/data/images/img0.jpg b/scripts/data/images/img0.jpg new file mode 100644 index 0000000000..4f9114402b Binary files /dev/null and b/scripts/data/images/img0.jpg differ diff --git a/scripts/data/images/img1.jpg b/scripts/data/images/img1.jpg new file mode 100644 index 0000000000..15c881bd1a Binary files /dev/null and b/scripts/data/images/img1.jpg differ diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 8f97856eea..492c0cf9b6 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json - +import db_dtypes # type: ignore import geopandas as gpd # type: ignore import pandas as pd import pyarrow as pa @@ -24,19 +23,6 @@ import bigframes.pandas as bpd -def _get_series_from_json(json_data): - # Note: converts None to sql "null" and not to json none. 
- values = [ - f"JSON '{json.dumps(data)}'" if data is not None else "NULL" - for data in json_data - ] - sql = " UNION ALL ".join( - [f"SELECT {id} AS id, {value} AS data" for id, value in enumerate(values)] - ) - df = bpd.read_gbq(sql).set_index("id").sort_index() - return df["data"] - - @pytest.mark.parametrize( ("json_path", "expected_json"), [ @@ -45,10 +31,11 @@ def _get_series_from_json(json_data): ], ) def test_json_set_at_json_path(json_path, expected_json): - s = _get_series_from_json([{"a": {"b": {"c": "tester", "d": []}}}]) + original_json = [{"a": {"b": {"c": "tester", "d": []}}}] + s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype()) actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) - expected = _get_series_from_json(expected_json) + expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype()) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -65,10 +52,11 @@ def test_json_set_at_json_path(json_path, expected_json): ], ) def test_json_set_at_json_value_type(json_value, expected_json): - s = _get_series_from_json([{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}]) + original_json = [{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}] + s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype()) actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)]) - expected = _get_series_from_json(expected_json) + expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype()) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -76,13 +64,14 @@ def test_json_set_at_json_value_type(json_value, expected_json): def test_json_set_w_more_pairs(): - s = _get_series_from_json([{"a": 2}, {"b": 5}, {"c": 1}]) + original_json = [{"a": 2}, {"b": 5}, {"c": 1}] + s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype()) actual = bbq.json_set( s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])] ) - expected = _get_series_from_json( - [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}] - ) + + expected_json = [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}] + expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype()) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -90,16 +79,16 @@ def test_json_set_w_more_pairs(): def test_json_set_w_invalid_json_path_value_pairs(): + s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype()) with pytest.raises(ValueError): - bbq.json_set( - _get_series_from_json([{"a": 10}]), json_path_value_pairs=[("$.a", 1, 100)] # type: ignore - ) + bbq.json_set(s, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore def test_json_set_w_invalid_value_type(): + s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype()) with pytest.raises(TypeError): bbq.json_set( - _get_series_from_json([{"a": 10}]), + s, json_path_value_pairs=[ ( "$.a", @@ -117,9 +106,12 @@ def test_json_set_w_invalid_series_type(): def test_json_extract_from_json(): - s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]) + s = bpd.Series( + [{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}], + dtype=db_dtypes.JSONDtype(), + ) actual = bbq.json_extract(s, "$.a.b").to_pandas() - expected = _get_series_from_json([[1, 2], None, 0]).to_pandas() + expected = bpd.Series([[1, 2], None, 0], dtype=db_dtypes.JSONDtype()).to_pandas() pd.testing.assert_series_equal( actual, expected, @@ -127,9 +119,12 @@ def test_json_extract_from_json(): def test_json_extract_from_string(): - s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', 
'{"a": {"b": 0}}']) + s = bpd.Series( + ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], + dtype=pd.StringDtype(storage="pyarrow"), + ) actual = bbq.json_extract(s, "$.a.b") - expected = bpd.Series(["[1,2]", None, "0"]) + expected = bpd.Series(["[1,2]", None, "0"], dtype=pd.StringDtype(storage="pyarrow")) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -142,8 +137,9 @@ def test_json_extract_w_invalid_series_type(): def test_json_extract_array_from_json(): - s = _get_series_from_json( - [{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}] + s = bpd.Series( + [{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}], + dtype=db_dtypes.JSONDtype(), ) actual = bbq.json_extract_array(s, "$.a") @@ -160,6 +156,8 @@ def test_json_extract_array_from_json(): """ df = bpd.read_gbq(sql).set_index("id").sort_index() expected = df["data"] + expected.index.name = None + expected.name = None pd.testing.assert_series_equal( actual.to_pandas(), diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py new file mode 100644 index 0000000000..effadd3b22 --- /dev/null +++ b/tests/system/small/blob/test_io.py @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
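The test rewrites above replace the SQL-literal helper with direct construction of JSON Series, which is the pattern the new read_pandas/Series support enables. For reference, a hedged sketch of that usage (assumes the db-dtypes package and an authenticated session):

import db_dtypes  # type: ignore

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

s = bpd.Series(
    [{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}],
    dtype=db_dtypes.JSONDtype(),
)

extracted = bbq.json_extract(s, "$.a.b")
updated = bbq.json_set(s, json_path_value_pairs=[("$.a.b", 10)])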
+ +import bigframes +import bigframes.pandas as bpd + + +def test_blob_create_from_uri_str(): + bigframes.options.experiments.blob = True + + uri_series = bpd.Series( + [ + "gs://bigframes_blob_test/images/img0.jpg", + "gs://bigframes_blob_test/images/img1.jpg", + ] + ) + # TODO: use bq_connection fixture when MMD location capitalization fix is in prod + blob_series = uri_series.str.to_blob(connection="us.bigframes-default-connection") + + pd_blob_series = blob_series.to_pandas() + + assert len(pd_blob_series) == 2 diff --git a/tests/system/small/core/test_indexers.py b/tests/system/small/core/test_indexers.py index 2c670f790d..20f1c56185 100644 --- a/tests/system/small/core/test_indexers.py +++ b/tests/system/small/core/test_indexers.py @@ -54,26 +54,10 @@ def string_indexed_number_series(session): ) -def test_non_string_indexed_struct_series_with_string_key_should_warn(session): - s = bpd.Series( - [ - {"project": "pandas", "version": 1}, - ], - dtype=bpd.ArrowDtype( - pa.struct([("project", pa.string()), ("version", pa.int64())]) - ), - session=session, - ) - - with pytest.warns(bigframes.exceptions.BadIndexerKeyWarning): - s["a"] - - @pytest.mark.parametrize( "series", [ "string_indexed_struct_series", - "number_series", "string_indexed_number_series", ], ) diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py index 6c44a62686..356000b3f6 100644 --- a/tests/system/small/operations/test_timedeltas.py +++ b/tests/system/small/operations/test_timedeltas.py @@ -14,12 +14,15 @@ import datetime +import operator import numpy as np import pandas as pd import pandas.testing import pytest +from bigframes import dtypes + @pytest.fixture(scope="module") def temporal_dfs(session): @@ -28,12 +31,24 @@ def temporal_dfs(session): "datetime_col": [ pd.Timestamp("2025-02-01 01:00:01"), pd.Timestamp("2019-01-02 02:00:00"), + pd.Timestamp("1997-01-01 19:00:00"), ], "timestamp_col": [ pd.Timestamp("2023-01-01 01:00:01", tz="UTC"), pd.Timestamp("2024-01-02 02:00:00", tz="UTC"), + pd.Timestamp("2005-03-05 02:00:00", tz="UTC"), + ], + "timedelta_col_1": [ + pd.Timedelta(5, "s"), + pd.Timedelta(-4, "d"), + pd.Timedelta(5, "h"), + ], + "timedelta_col_2": [ + pd.Timedelta(3, "s"), + pd.Timedelta(-4, "d"), + pd.Timedelta(6, "h"), ], - "timedelta_col": [pd.Timedelta(3, "s"), pd.Timedelta(-4, "d")], + "numeric_col": [1.5, 2, -3], } ) @@ -42,6 +57,100 @@ def temporal_dfs(session): return bigframes_df, pandas_df +def _assert_series_equal(actual: pd.Series, expected: pd.Series): + """Helper function specifically for timedelta testsing. 
Don't use it outside of this module.""" + if actual.dtype == dtypes.FLOAT_DTYPE: + pandas.testing.assert_series_equal( + actual, expected.astype("Float64"), check_index_type=False + ) + elif actual.dtype == dtypes.INT_DTYPE: + pandas.testing.assert_series_equal( + actual, expected.astype("Int64"), check_index_type=False + ) + else: + pandas.testing.assert_series_equal( + actual.astype("timedelta64[ns]"), + expected.dt.floor("us"), # in BF the precision is microsecond + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("op", "col_1", "col_2"), + [ + (operator.add, "timedelta_col_1", "timedelta_col_2"), + (operator.sub, "timedelta_col_1", "timedelta_col_2"), + (operator.truediv, "timedelta_col_1", "timedelta_col_2"), + (operator.floordiv, "timedelta_col_1", "timedelta_col_2"), + (operator.truediv, "timedelta_col_1", "numeric_col"), + (operator.floordiv, "timedelta_col_1", "numeric_col"), + (operator.mul, "timedelta_col_1", "numeric_col"), + (operator.mul, "numeric_col", "timedelta_col_1"), + ], +) +def test_timedelta_binary_ops_between_series(temporal_dfs, op, col_1, col_2): + bf_df, pd_df = temporal_dfs + + actual_result = op(bf_df[col_1], bf_df[col_2]).to_pandas() + + expected_result = op(pd_df[col_1], pd_df[col_2]) + _assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize( + ("op", "col", "literal"), + [ + (operator.add, "timedelta_col_1", pd.Timedelta(2, "s")), + (operator.sub, "timedelta_col_1", pd.Timedelta(2, "s")), + (operator.truediv, "timedelta_col_1", pd.Timedelta(2, "s")), + (operator.floordiv, "timedelta_col_1", pd.Timedelta(2, "s")), + (operator.truediv, "timedelta_col_1", 3), + (operator.floordiv, "timedelta_col_1", 3), + (operator.mul, "timedelta_col_1", 3), + (operator.mul, "numeric_col", pd.Timedelta(1, "s")), + ], +) +def test_timedelta_binary_ops_series_and_literal(temporal_dfs, op, col, literal): + bf_df, pd_df = temporal_dfs + + actual_result = op(bf_df[col], literal).to_pandas() + + expected_result = op(pd_df[col], literal) + _assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize( + ("op", "col", "literal"), + [ + (operator.add, "timedelta_col_1", pd.Timedelta(2, "s")), + (operator.sub, "timedelta_col_1", pd.Timedelta(2, "s")), + (operator.truediv, "timedelta_col_1", pd.Timedelta(2, "s")), + (operator.floordiv, "timedelta_col_1", pd.Timedelta(2, "s")), + (operator.truediv, "numeric_col", pd.Timedelta(2, "s")), + (operator.floordiv, "numeric_col", pd.Timedelta(2, "s")), + (operator.mul, "timedelta_col_1", 3), + (operator.mul, "numeric_col", pd.Timedelta(1, "s")), + ], +) +def test_timedelta_binary_ops_literal_and_series(temporal_dfs, op, col, literal): + bf_df, pd_df = temporal_dfs + + actual_result = op(literal, bf_df[col]).to_pandas() + + expected_result = op(literal, pd_df[col]) + _assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize("op", [operator.pos, operator.neg, operator.abs]) +def test_timedelta_unary_ops(temporal_dfs, op): + bf_df, pd_df = temporal_dfs + + actual_result = op(bf_df["timedelta_col_1"]).to_pandas() + + expected_result = op(pd_df["timedelta_col_1"]) + _assert_series_equal(actual_result, expected_result) + + @pytest.mark.parametrize( ("column", "pd_dtype"), [ @@ -53,10 +162,10 @@ def test_timestamp_add__ts_series_plus_td_series(temporal_dfs, column, pd_dtype) bf_df, pd_df = temporal_dfs actual_result = ( - (bf_df[column] + bf_df["timedelta_col"]).to_pandas().astype(pd_dtype) + (bf_df[column] + bf_df["timedelta_col_1"]).to_pandas().astype(pd_dtype) ) - 
expected_result = pd_df[column] + pd_df["timedelta_col"] + expected_result = pd_df[column] + pd_df["timedelta_col_1"] pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -94,10 +203,10 @@ def test_timestamp_add__td_series_plus_ts_series(temporal_dfs, column, pd_dtype) bf_df, pd_df = temporal_dfs actual_result = ( - (bf_df["timedelta_col"] + bf_df[column]).to_pandas().astype(pd_dtype) + (bf_df["timedelta_col_1"] + bf_df[column]).to_pandas().astype(pd_dtype) ) - expected_result = pd_df["timedelta_col"] + pd_df[column] + expected_result = pd_df["timedelta_col_1"] + pd_df[column] pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -120,10 +229,10 @@ def test_timestamp_add__ts_literal_plus_td_series(temporal_dfs): timestamp = pd.Timestamp("2025-01-01", tz="UTC") actual_result = ( - (timestamp + bf_df["timedelta_col"]).to_pandas().astype("datetime64[ns, UTC]") + (timestamp + bf_df["timedelta_col_1"]).to_pandas().astype("datetime64[ns, UTC]") ) - expected_result = timestamp + pd_df["timedelta_col"] + expected_result = timestamp + pd_df["timedelta_col_1"] pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -140,10 +249,10 @@ def test_timestamp_add_with_numpy_op(temporal_dfs, column, pd_dtype): bf_df, pd_df = temporal_dfs actual_result = ( - np.add(bf_df[column], bf_df["timedelta_col"]).to_pandas().astype(pd_dtype) + np.add(bf_df[column], bf_df["timedelta_col_1"]).to_pandas().astype(pd_dtype) ) - expected_result = np.add(pd_df[column], pd_df["timedelta_col"]) + expected_result = np.add(pd_df[column], pd_df["timedelta_col_1"]) pandas.testing.assert_series_equal( actual_result, expected_result, check_index_type=False ) @@ -164,3 +273,195 @@ def test_timestamp_add_dataframes(temporal_dfs): pandas.testing.assert_frame_equal( actual_result, expected_result, check_index_type=False ) + + +@pytest.mark.parametrize( + ("column", "pd_dtype"), + [ + ("datetime_col", " pd.Timedelta(1, "h"))] + .to_pandas() + .astype(" pd.Timedelta(1, "h")] + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +def test_timedelta_ordering(session): + pd_df = pd.DataFrame( + { + "col_1": [ + pd.Timestamp("2025-01-01 01:00:00"), + pd.Timestamp("2025-01-01 02:00:00"), + pd.Timestamp("2025-01-01 03:00:00"), + ], + "col_2": [ + pd.Timestamp("2025-01-01 01:00:02"), + pd.Timestamp("2025-01-01 02:00:01"), + pd.Timestamp("2025-01-01 02:59:59"), + ], + } + ) + bf_df = session.read_pandas(pd_df) + + actual_result = ( + (bf_df["col_2"] - bf_df["col_1"]) + .sort_values() + .to_pandas() + .astype("timedelta64[ns]") + ) + + expected_result = (pd_df["col_2"] - pd_df["col_1"]).sort_values() + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 1db89a074a..26b941a596 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2471,6 +2471,20 @@ def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) +@skip_legacy_pandas +def test_df_reverse_binop_pandas(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_series = pd.Series([100, 200, 300]) + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = pd_series + scalars_df[df_columns].to_pandas() + pd_result = pd_series + 
scalars_pandas_df[df_columns] + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + def test_listlike_binop_axis_1_bf_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 10637b2395..f84ee811a3 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -658,6 +658,21 @@ def test_to_gbq_w_invalid_destination_table(scalars_df_index): scalars_df_index.to_gbq("table_id") +def test_to_gbq_w_json(bigquery_client): + """Test the `to_gbq` API can get a JSON column.""" + s1 = bpd.Series([1, 2, 3, 4]) + s2 = bpd.Series( + ["a", 1, False, ["a", {"b": 1}], {"c": [1, 2, 3]}], dtype=db_dtypes.JSONDtype() + ) + + df = bpd.DataFrame({"id": s1, "json_col": s2}) + destination_table = df.to_gbq() + table = bigquery_client.get_table(destination_table) + + assert table.schema[1].name == "json_col" + assert table.schema[1].field_type == "JSON" + + @pytest.mark.parametrize( ("index"), [True, False], diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index cdda7c753d..00f47c754e 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -237,7 +237,7 @@ def test_series_construct_geodata(): pytest.param(pd.StringDtype(storage="pyarrow"), id="string"), ], ) -def test_series_construct_w_dtype_for_int(dtype): +def test_series_construct_w_dtype(dtype): data = [1, 2, 3] expected = pd.Series(data, dtype=dtype) expected.index = expected.index.astype("Int64") @@ -302,6 +302,26 @@ def test_series_construct_w_dtype_for_array_struct(): ) +def test_series_construct_w_dtype_for_json(): + data = [ + 1, + "str", + False, + ["a", {"b": 1}, None], + None, + {"a": {"b": [1, 2, 3], "c": True}}, + ] + s = bigframes.pandas.Series(data, dtype=db_dtypes.JSONDtype()) + + assert s[0] == 1 + assert s[1] == "str" + assert s[2] is False + assert s[3][0] == "a" + assert s[3][1]["b"] == 1 + assert pd.isna(s[4]) + assert s[5]["a"] == {"b": [1, 2, 3], "c": True} + + def test_series_keys(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df["int64_col"].keys().to_pandas() @@ -1636,6 +1656,27 @@ def test_series_binop_w_other_types(scalars_dfs, other): ) +@pytest.mark.parametrize( + ("other",), + [ + ([-1.4, 2.3, None],), + (pd.Index([-1.4, 2.3, None]),), + (pd.Series([-1.4, 2.3, None], index=[44, 2, 1]),), + ], +) +@skip_legacy_pandas +def test_series_reverse_binop_w_other_types(scalars_dfs, other): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (other + scalars_df["int64_col"].head(3)).to_pandas() + pd_result = other + scalars_pandas_df["int64_col"].head(3) + + assert_series_equal( + bf_result, + pd_result, + ) + + @skip_legacy_pandas def test_series_combine_first(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index a4acb72117..0c8da52774 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -22,6 +22,7 @@ import warnings import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq +import db_dtypes # type: ignore import google import google.cloud.bigquery as bigquery import numpy as np @@ -747,6 +748,77 @@ def test_read_pandas_timedelta_index(session, write_engine): pd.testing.assert_index_equal(actual_result, expected_index) +@pytest.mark.parametrize( + ("write_engine"), + [ + pytest.param("default"), + pytest.param("bigquery_load"), + 
pytest.param("bigquery_streaming"), + pytest.param("bigquery_inline", marks=pytest.mark.xfail(raises=ValueError)), + ], +) +def test_read_pandas_json_dataframes(session, write_engine): + json_data = [ + 1, + None, + ["1", "3", "5"], + {"a": 1, "b": ["x", "y"], "c": {"z": False, "x": []}}, + ] + expected_df = pd.DataFrame( + {"my_col": pd.Series(json_data, dtype=db_dtypes.JSONDtype())} + ) + + actual_result = session.read_pandas( + expected_df, write_engine=write_engine + ).to_pandas() + + if write_engine == "bigquery_streaming": + expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64") + pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False) + + +@pytest.mark.parametrize( + "write_engine", + ["default", "bigquery_load"], +) +def test_read_pandas_json_series(session, write_engine): + json_data = [ + 1, + None, + ["1", "3", "5"], + {"a": 1, "b": ["x", "y"], "c": {"z": False, "x": []}}, + ] + expected_series = pd.Series(json_data, dtype=db_dtypes.JSONDtype()) + + actual_result = session.read_pandas( + expected_series, write_engine=write_engine + ).to_pandas() + pd.testing.assert_series_equal( + actual_result, expected_series, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("write_engine"), + [ + pytest.param("default"), + pytest.param("bigquery_load"), + ], +) +def test_read_pandas_json_index(session, write_engine): + json_data = [ + 1, + None, + ["1", "3", "5"], + {"a": 1, "b": ["x", "y"], "c": {"z": False, "x": []}}, + ] + expected_index = pd.Index(json_data, dtype=db_dtypes.JSONDtype()) + actual_result = session.read_pandas( + expected_index, write_engine=write_engine + ).to_pandas() + pd.testing.assert_index_equal(actual_result, expected_index) + + @utils.skip_legacy_pandas @pytest.mark.parametrize( ("write_engine",), diff --git a/third_party/bigframes_vendored/tpch/queries/q14.py b/third_party/bigframes_vendored/tpch/queries/q14.py index f3b747219b..a0260394b9 100644 --- a/third_party/bigframes_vendored/tpch/queries/q14.py +++ b/third_party/bigframes_vendored/tpch/queries/q14.py @@ -18,7 +18,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): var1 = date(1995, 9, 1) var2 = date(1995, 10, 1) - merged = lineitem.merge(part, left_on="L_PARTKEY", right_on="P_PARTKEY") + merged = part.merge(lineitem, left_on="P_PARTKEY", right_on="L_PARTKEY") filtered = merged[(merged["L_SHIPDATE"] >= var1) & (merged["L_SHIPDATE"] < var2)] diff --git a/third_party/bigframes_vendored/tpch/queries/q16.py b/third_party/bigframes_vendored/tpch/queries/q16.py index a02dcef5dc..f55939b03c 100644 --- a/third_party/bigframes_vendored/tpch/queries/q16.py +++ b/third_party/bigframes_vendored/tpch/queries/q16.py @@ -20,22 +20,16 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): var1 = "Brand#45" - supplier = ( - supplier[ - ~supplier["S_COMMENT"].str.contains("Customer.*Complaints", regex=True) - ]["S_SUPPKEY"] - .unique(keep_order=False) - .to_frame() - ) + supplier = supplier[ + ~supplier["S_COMMENT"].str.contains("Customer.*Complaints", regex=True) + ]["S_SUPPKEY"] q_filtered = part.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY") q_filtered = q_filtered[q_filtered["P_BRAND"] != var1] q_filtered = q_filtered[~q_filtered["P_TYPE"].str.contains("MEDIUM POLISHED")] q_filtered = q_filtered[q_filtered["P_SIZE"].isin([49, 14, 23, 45, 19, 3, 36, 9])] - final_df = q_filtered.merge( - supplier, left_on=["PS_SUPPKEY"], right_on=["S_SUPPKEY"] - ) + final_df = q_filtered[q_filtered["PS_SUPPKEY"].isin(supplier)] grouped = 
final_df.groupby(["P_BRAND", "P_TYPE", "P_SIZE"], as_index=False) result = grouped.agg( diff --git a/third_party/bigframes_vendored/tpch/queries/q17.py b/third_party/bigframes_vendored/tpch/queries/q17.py index e6a87dc482..aa7f743602 100644 --- a/third_party/bigframes_vendored/tpch/queries/q17.py +++ b/third_party/bigframes_vendored/tpch/queries/q17.py @@ -19,7 +19,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): filtered_part = part[(part["P_BRAND"] == VAR1) & (part["P_CONTAINER"] == VAR2)] q1 = bpd.merge( - filtered_part, lineitem, how="left", left_on="P_PARTKEY", right_on="L_PARTKEY" + lineitem, filtered_part, how="right", left_on="L_PARTKEY", right_on="P_PARTKEY" ) grouped = ( diff --git a/third_party/bigframes_vendored/tpch/queries/q18.py b/third_party/bigframes_vendored/tpch/queries/q18.py index c6802e6808..576ce58d5c 100644 --- a/third_party/bigframes_vendored/tpch/queries/q18.py +++ b/third_party/bigframes_vendored/tpch/queries/q18.py @@ -22,14 +22,13 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): var1 = 300 + # order with over 300 items q1 = lineitem.groupby("L_ORDERKEY", as_index=False).agg( SUM_QUANTITY=bpd.NamedAgg(column="L_QUANTITY", aggfunc="sum") ) q1 = q1[q1["SUM_QUANTITY"] > var1] - filtered_orders = orders.merge( - q1, left_on="O_ORDERKEY", right_on="L_ORDERKEY", how="inner" - ) + filtered_orders = orders[orders["O_ORDERKEY"].isin(q1["L_ORDERKEY"])] result = filtered_orders.merge( lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY" diff --git a/third_party/bigframes_vendored/tpch/queries/q20.py b/third_party/bigframes_vendored/tpch/queries/q20.py index 5c2d8d391f..7c434eba03 100644 --- a/third_party/bigframes_vendored/tpch/queries/q20.py +++ b/third_party/bigframes_vendored/tpch/queries/q20.py @@ -44,19 +44,19 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): filtered_parts = part[part["P_NAME"].str.startswith(var4)] - filtered_parts = filtered_parts["P_PARTKEY"].unique(keep_order=False).to_frame() - joined_parts = filtered_parts.merge( - partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY" - ) + filtered_parts = filtered_parts["P_PARTKEY"] + joined_parts = partsupp[partsupp["PS_PARTKEY"].isin(filtered_parts)] - final_join = joined_parts.merge( - q1, left_on=["PS_SUPPKEY", "P_PARTKEY"], right_on=["L_SUPPKEY", "L_PARTKEY"] + final_join = q1.merge( + joined_parts, + left_on=["L_SUPPKEY", "L_PARTKEY"], + right_on=["PS_SUPPKEY", "PS_PARTKEY"], ) - final_filtered = final_join[final_join["PS_AVAILQTY"] > final_join["SUM_QUANTITY"]] - - final_filtered = final_filtered["PS_SUPPKEY"].unique(keep_order=False).to_frame() + final_filtered = final_join[final_join["PS_AVAILQTY"] > final_join["SUM_QUANTITY"]][ + "PS_SUPPKEY" + ] - final_result = final_filtered.merge(q3, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") + final_result = q3[q3["S_SUPPKEY"].isin(final_filtered)] final_result = final_result[["S_NAME", "S_ADDRESS"]].sort_values(by="S_NAME") next(final_result.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q22.py b/third_party/bigframes_vendored/tpch/queries/q22.py index 153ef63c5d..a8d147eae4 100644 --- a/third_party/bigframes_vendored/tpch/queries/q22.py +++ b/third_party/bigframes_vendored/tpch/queries/q22.py @@ -15,37 +15,22 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) country_codes = ["13", "31", "23", "29", "30", "18", "17"] - customer["CNTRYCODE"] = customer["C_PHONE"].str.slice(0, 2) + customer = 
customer[customer["CNTRYCODE"].isin(country_codes)] avg_acctbal = ( - customer[ - (customer["CNTRYCODE"].isin(country_codes)) & (customer["C_ACCTBAL"] > 0) - ][["C_ACCTBAL"]] + customer[customer["C_ACCTBAL"] > 0.0][["C_ACCTBAL"]] .mean() .rename("AVG_ACCTBAL") ) - - orders_unique = orders["O_CUSTKEY"].unique(keep_order=False).to_frame() - - matched_customers = customer.merge( - orders_unique, left_on="C_CUSTKEY", right_on="O_CUSTKEY" - ) - matched_customers["IS_IN_ORDERS"] = True - - customer = customer.merge( - matched_customers[["C_CUSTKEY", "IS_IN_ORDERS"]], on="C_CUSTKEY", how="left" - ) - customer["IS_IN_ORDERS"] = customer["IS_IN_ORDERS"].fillna(False) customer = customer.merge(avg_acctbal, how="cross") - filtered_customers = customer[ - (customer["CNTRYCODE"].isin(country_codes)) - & (customer["C_ACCTBAL"] > customer["AVG_ACCTBAL"]) - & (~customer["IS_IN_ORDERS"]) - ] + filtered_customer = customer[customer["C_ACCTBAL"] > customer["AVG_ACCTBAL"]] - result = filtered_customers.groupby("CNTRYCODE", as_index=False).agg( + filtered_customer = filtered_customer[ + ~filtered_customer["C_CUSTKEY"].isin(orders["O_CUSTKEY"]) + ] + result = filtered_customer.groupby("CNTRYCODE", as_index=False).agg( NUMCUST=bpd.NamedAgg(column="C_CUSTKEY", aggfunc="count"), TOTACCTBAL=bpd.NamedAgg(column="C_ACCTBAL", aggfunc="sum"), ) diff --git a/third_party/bigframes_vendored/tpch/queries/q3.py b/third_party/bigframes_vendored/tpch/queries/q3.py index 60d181a603..5a43f5fff7 100644 --- a/third_party/bigframes_vendored/tpch/queries/q3.py +++ b/third_party/bigframes_vendored/tpch/queries/q3.py @@ -23,11 +23,13 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): fcustomer = customer[customer["C_MKTSEGMENT"] == "BUILDING"] - jn1 = fcustomer.merge(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY") - jn2 = jn1.merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY") + filtered_orders = orders[orders["O_ORDERDATE"] < date_var] + filtered_lineitem = lineitem[lineitem["L_SHIPDATE"] > date_var] - jn2 = jn2[jn2["O_ORDERDATE"] < date_var] - jn2 = jn2[jn2["L_SHIPDATE"] > date_var] + jn1 = filtered_lineitem.merge( + filtered_orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY" + ) + jn2 = fcustomer.merge(jn1, left_on="C_CUSTKEY", right_on="O_CUSTKEY") jn2["REVENUE"] = jn2["L_EXTENDEDPRICE"] * (1 - jn2["L_DISCOUNT"]) gb = jn2.groupby(["O_ORDERKEY", "O_ORDERDATE", "O_SHIPPRIORITY"], as_index=False) diff --git a/third_party/bigframes_vendored/tpch/queries/q5.py b/third_party/bigframes_vendored/tpch/queries/q5.py index 406df79a5a..1361c40901 100644 --- a/third_party/bigframes_vendored/tpch/queries/q5.py +++ b/third_party/bigframes_vendored/tpch/queries/q5.py @@ -35,20 +35,20 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): var2 = date(1994, 1, 1) var3 = date(1995, 1, 1) + region = region[region["R_NAME"] == var1] + orders = orders[(orders["O_ORDERDATE"] >= var2) & (orders["O_ORDERDATE"] < var3)] + lineitem["REVENUE"] = lineitem["L_EXTENDEDPRICE"] * (1.0 - lineitem["L_DISCOUNT"]) + jn1 = region.merge(nation, left_on="R_REGIONKEY", right_on="N_REGIONKEY") jn2 = jn1.merge(customer, left_on="N_NATIONKEY", right_on="C_NATIONKEY") - jn3 = jn2.merge(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY") - jn4 = jn3.merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY") + jn3 = orders.merge(jn2, left_on="O_CUSTKEY", right_on="C_CUSTKEY") + jn4 = lineitem.merge(jn3, left_on="L_ORDERKEY", right_on="O_ORDERKEY") jn5 = jn4.merge( supplier, left_on=["L_SUPPKEY", "N_NATIONKEY"], 
right_on=["S_SUPPKEY", "S_NATIONKEY"], ) - jn5 = jn5[jn5["R_NAME"] == var1] - jn5 = jn5[(jn5["O_ORDERDATE"] >= var2) & (jn5["O_ORDERDATE"] < var3)] - jn5["REVENUE"] = jn5["L_EXTENDEDPRICE"] * (1.0 - jn5["L_DISCOUNT"]) - gb = jn5.groupby("N_NAME", as_index=False)["REVENUE"].sum() result_df = gb.sort_values("REVENUE", ascending=False) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index e92072bea8..27dfb23603 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.36.0" +__version__ = "1.37.0"