diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index ae437bcb2f..6301519a9a 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:5efdf8d38e5a22c1ec9e5541cbdfde56399bdffcb6f531183f84ac66052a8024 -# created: 2024-10-23T18:04:53.195998718Z + digest: sha256:2ed982f884312e4883e01b5ab8af8b6935f0216a5a2d82928d273081fc3be562 +# created: 2024-11-12T12:09:45.821174897Z diff --git a/.kokoro/docker/docs/requirements.txt b/.kokoro/docker/docs/requirements.txt index 66eacc82f0..8bb0764594 100644 --- a/.kokoro/docker/docs/requirements.txt +++ b/.kokoro/docker/docs/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --allow-unsafe --generate-hashes requirements.in @@ -8,9 +8,9 @@ argcomplete==3.5.1 \ --hash=sha256:1a1d148bdaa3e3b93454900163403df41448a248af01b6e849edc5ac08e6c363 \ --hash=sha256:eb1ee355aa2557bd3d0145de7b06b2a45b0ce461e1e7813f5d066039ab4177b4 # via nox -colorlog==6.8.2 \ - --hash=sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44 \ - --hash=sha256:4dcbb62368e2800cb3c5abd348da7e53f6c362dda502ec27c560b2e58a66bd33 +colorlog==6.9.0 \ + --hash=sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff \ + --hash=sha256:bfba54a1b93b94f54e1f4fe48395725a3d92fd2a4af702f6bd70946bdc0c6ac2 # via nox distlib==0.3.9 \ --hash=sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87 \ @@ -24,9 +24,9 @@ nox==2024.10.9 \ --hash=sha256:1d36f309a0a2a853e9bccb76bbef6bb118ba92fa92674d15604ca99adeb29eab \ --hash=sha256:7aa9dc8d1c27e9f45ab046ffd1c3b2c4f7c91755304769df231308849ebded95 # via -r requirements.in -packaging==24.1 \ - --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ - --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 +packaging==24.2 \ + --hash=sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759 \ + --hash=sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f # via nox platformdirs==4.3.6 \ --hash=sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907 \ @@ -36,7 +36,7 @@ tomli==2.0.2 \ --hash=sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38 \ --hash=sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed # via nox -virtualenv==20.26.6 \ - --hash=sha256:280aede09a2a5c317e409a00102e7077c6432c5a38f0ef938e643805a7ad2c48 \ - --hash=sha256:7345cc5b25405607a624d8418154577459c3e0277f5466dd79c49d5e492995f2 +virtualenv==20.27.1 \ + --hash=sha256:142c6be10212543b32c6c45d3d3893dff89112cc588b7d0879ae5a1ec03a47ba \ + --hash=sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4 # via nox diff --git a/.kokoro/test-samples-impl.sh b/.kokoro/test-samples-impl.sh index 55910c8ba1..53e365bc4e 100755 --- a/.kokoro/test-samples-impl.sh +++ b/.kokoro/test-samples-impl.sh @@ -33,7 +33,8 @@ export PYTHONUNBUFFERED=1 env | grep KOKORO # Install nox -python3.9 -m pip install --upgrade --quiet nox +# `virtualenv==20.26.6` is added for Python 3.7 compatibility +python3.9 -m pip install --upgrade --quiet nox virtualenv==20.26.6 # Use secrets acessor service account to get secrets if [[ -f "${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" ]]; then diff --git a/CHANGELOG.md 
b/CHANGELOG.md
index 1df47b2afc..897e830961 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,25 @@
 
 [1]: https://pypi.org/project/bigframes/#history
 
+## [1.27.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.26.0...v1.27.0) (2024-11-16)
+
+
+### Features
+
+* Add astype(type, errors='null') to cast safely ([#1122](https://github.com/googleapis/python-bigquery-dataframes/issues/1122)) ([b4d17ff](https://github.com/googleapis/python-bigquery-dataframes/commit/b4d17ffdd891da266ad9765a087d3512c0e056fc))
+
+
+### Bug Fixes
+
+* Dataframe fillna with scalar. ([#1132](https://github.com/googleapis/python-bigquery-dataframes/issues/1132)) ([37f8c32](https://github.com/googleapis/python-bigquery-dataframes/commit/37f8c32a541565208602f3f6ed37dded13e16b9b))
+* Exclude index columns from model fitting processes. ([#1138](https://github.com/googleapis/python-bigquery-dataframes/issues/1138)) ([8d4da15](https://github.com/googleapis/python-bigquery-dataframes/commit/8d4da1582a5965e6a1f9732ec0ce592ea47ce5fa))
+* Unordered mode too many labels issue. ([#1148](https://github.com/googleapis/python-bigquery-dataframes/issues/1148)) ([7216b21](https://github.com/googleapis/python-bigquery-dataframes/commit/7216b21abd01bc61878bb5686f83ee13ef297912))
+
+
+### Documentation
+
+* Document groupby.head and groupby.size methods ([#1111](https://github.com/googleapis/python-bigquery-dataframes/issues/1111)) ([a61eb4d](https://github.com/googleapis/python-bigquery-dataframes/commit/a61eb4d6e323e5001715d402e0e67054df6e62af))
+
 ## [1.26.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.25.0...v1.26.0) (2024-11-12)
diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py
index c39502eade..b0e1cbdd18 100644
--- a/bigframes/_config/experiment_options.py
+++ b/bigframes/_config/experiment_options.py
@@ -22,6 +22,7 @@ class ExperimentOptions:
 
     def __init__(self):
         self._semantic_operators = False
+        self._blob = False
 
     @property
     def semantic_operators(self) -> bool:
@@ -34,3 +35,15 @@ def semantic_operators(self, value: bool):
                 "Semantic operators are still under experiments, and are subject to change in the future."
             )
         self._semantic_operators = value
+
+    @property
+    def blob(self) -> bool:
+        return self._blob
+
+    @blob.setter
+    def blob(self, value: bool):
+        if value is True:
+            warnings.warn(
+                "BigFrames Blob is still under experiments. It may not work and is subject to change in the future."
+ ) + self._blob = value diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2648c9993f..4fc663817c 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2426,14 +2426,12 @@ def to_sql_query( def cached(self, *, force: bool = False, session_aware: bool = False) -> None: """Write the block to a session table.""" # use a heuristic for whether something needs to be cached - if (not force) and self.session._executor._is_trivially_executable(self.expr): - return - elif session_aware: - self.session._executor._cache_with_session_awareness(self.expr) - else: - self.session._executor._cache_with_cluster_cols( - self.expr, cluster_cols=self.index_columns - ) + self.session._executor.cached( + self.expr, + force=force, + use_session=session_aware, + cluster_cols=self.index_columns, + ) def _is_monotonic( self, column_ids: typing.Union[str, Sequence[str]], increasing: bool diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index a4c37b7c5d..5abf97a78d 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -89,7 +89,7 @@ def cast_ibis_value( - value: ibis_types.Value, to_type: ibis_dtypes.DataType + value: ibis_types.Value, to_type: ibis_dtypes.DataType, safe: bool = False ) -> ibis_types.Value: """Perform compatible type casts of ibis values @@ -176,7 +176,7 @@ def cast_ibis_value( value = ibis_value_to_canonical_type(value) if value.type() in good_casts: if to_type in good_casts[value.type()]: - return value.cast(to_type) + return value.try_cast(to_type) if safe else value.cast(to_type) else: # this should never happen raise TypeError( @@ -188,10 +188,16 @@ def cast_ibis_value( # BigQuery casts bools to lower case strings. Capitalize the result to match Pandas # TODO(bmil): remove this workaround after fixing Ibis if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.string: - return cast(ibis_types.StringValue, value.cast(to_type)).capitalize() + if safe: + return cast(ibis_types.StringValue, value.try_cast(to_type)).capitalize() + else: + return cast(ibis_types.StringValue, value.cast(to_type)).capitalize() if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.float64: - return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64) + if safe: + return value.try_cast(ibis_dtypes.int64).try_cast(ibis_dtypes.float64) + else: + return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64) if value.type() == ibis_dtypes.float64 and to_type == ibis_dtypes.bool: return value != ibis_types.literal(0) diff --git a/bigframes/core/compile/polars/__init__.py b/bigframes/core/compile/polars/__init__.py new file mode 100644 index 0000000000..e15f229faf --- /dev/null +++ b/bigframes/core/compile/polars/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import warnings + +try: + import polars # noqa + + from bigframes.core.compile.polars.compiler import PolarsCompiler + + __all__ = ["PolarsCompiler"] +except Exception: + warnings.warn("Polars compiler not available as polars is not installed.") diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py new file mode 100644 index 0000000000..d1ae063b59 --- /dev/null +++ b/bigframes/core/compile/polars/compiler.py @@ -0,0 +1,379 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import dataclasses +import functools +import itertools +from typing import cast, Sequence, TYPE_CHECKING + +import bigframes.core +import bigframes.core.expression as ex +import bigframes.core.guid as guid +import bigframes.core.nodes as nodes +import bigframes.core.rewrite +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +polars_installed = True +if TYPE_CHECKING: + import polars as pl +else: + try: + import polars as pl + except Exception: + polars_installed = False + +if polars_installed: + + @dataclasses.dataclass(frozen=True) + class PolarsExpressionCompiler: + """ + Simple compiler for converting bigframes expressions to polars expressions. + + Should be extended to dispatch based on bigframes schema types. 
+ """ + + @functools.singledispatchmethod + def compile_expression(self, expression: ex.Expression): + raise NotImplementedError(f"Cannot compile expression: {expression}") + + @compile_expression.register + def _( + self, + expression: ex.ScalarConstantExpression, + ): + return pl.lit(expression.value) + + @compile_expression.register + def _( + self, + expression: ex.DerefOp, + ): + return pl.col(expression.id.sql) + + @compile_expression.register + def _( + self, + expression: ex.OpExpression, + ): + # TODO: Complete the implementation, convert to hash dispatch + op = expression.op + args = tuple(map(self.compile_expression, expression.inputs)) + if isinstance(op, ops.invert_op.__class__): + return args[0].neg() + if isinstance(op, ops.and_op.__class__): + return args[0] & args[1] + if isinstance(op, ops.or_op.__class__): + return args[0] | args[1] + if isinstance(op, ops.add_op.__class__): + return args[0] + args[1] + if isinstance(op, ops.sub_op.__class__): + return args[0] - args[1] + if isinstance(op, ops.ge_op.__class__): + return args[0] >= args[1] + if isinstance(op, ops.gt_op.__class__): + return args[0] > args[1] + if isinstance(op, ops.le_op.__class__): + return args[0] <= args[1] + if isinstance(op, ops.lt_op.__class__): + return args[0] < args[1] + if isinstance(op, ops.eq_op.__class__): + return args[0] == args[1] + if isinstance(op, ops.mod_op.__class__): + return args[0] % args[1] + if isinstance(op, ops.coalesce_op.__class__): + return pl.coalesce(*args) + if isinstance(op, ops.CaseWhenOp): + expr = pl.when(args[0]).then(args[1]) + for pred, result in zip(args[2::2], args[3::2]): + return expr.when(pred).then(result) + return expr + raise NotImplementedError(f"Polars compiler hasn't implemented {op}") + + @dataclasses.dataclass(frozen=True) + class PolarsAggregateCompiler: + scalar_compiler = PolarsExpressionCompiler() + + def get_args( + self, + agg: ex.Aggregation, + ) -> Sequence[pl.Expr]: + """Prepares arguments for aggregation by compiling them.""" + if isinstance(agg, ex.NullaryAggregation): + return [] + elif isinstance(agg, ex.UnaryAggregation): + arg = self.scalar_compiler.compile_expression(agg.arg) + return [arg] + elif isinstance(agg, ex.BinaryAggregation): + larg = self.scalar_compiler.compile_expression(agg.left) + rarg = self.scalar_compiler.compile_expression(agg.right) + return [larg, rarg] + + raise NotImplementedError( + f"Aggregation {agg} not yet supported in polars engine." + ) + + def compile_agg_op(self, op: agg_ops.WindowOp, inputs: Sequence[str] = []): + if isinstance(op, agg_ops.ProductOp): + # TODO: Need schema to cast back to original type if posisble (eg float back to int) + return pl.col(*inputs).log().sum().exp() + if isinstance(op, agg_ops.SumOp): + return pl.sum(*inputs) + if isinstance(op, agg_ops.MinOp): + return pl.min(*inputs) + if isinstance(op, agg_ops.MaxOp): + return pl.max(*inputs) + if isinstance(op, agg_ops.CountOp): + return pl.count(*inputs) + if isinstance(op, agg_ops.CorrOp): + return pl.corr(*inputs) + raise NotImplementedError( + f"Aggregate op {op} not yet supported in polars engine." + ) + + +@dataclasses.dataclass(frozen=True) +class PolarsCompiler: + """ + Compiles ArrayValue to polars LazyFrame and executes. + + This feature is in development and is incomplete. + While most node types are supported, this has the following limitations: + 1. GBQ data sources not supported. + 2. Joins do not order rows correctly + 3. Incomplete scalar op support + 4. Incomplete aggregate op support + 5. 
Incomplete analytic op support + 6. Some complex windowing types not supported (eg. groupby + rolling) + 7. UDFs are not supported. + 8. Returned types may not be entirely consistent with BigQuery backend + 9. Some operations are not entirely lazy - sampling and somse windowing. + """ + + expr_compiler = PolarsExpressionCompiler() + agg_compiler = PolarsAggregateCompiler() + + def compile(self, array_value: bigframes.core.ArrayValue) -> pl.LazyFrame: + if not polars_installed: + raise ValueError( + "Polars is not installed, cannot compile to polars engine." + ) + + # TODO: Create standard way to configure BFET -> BFET rewrites + # Polars has incomplete slice support in lazy mode + node = bigframes.core.rewrite.replace_slice_ops(array_value.node) + return self.compile_node(node) + + @functools.singledispatchmethod + def compile_node(self, node: nodes.BigFrameNode): + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unrecognized node: {node}") + + @compile_node.register + def compile_readlocal(self, node: nodes.ReadLocalNode): + cols_to_read = { + scan_item.source_id: scan_item.id.sql for scan_item in node.scan_list.items + } + return ( + pl.read_ipc(node.feather_bytes, columns=list(cols_to_read.keys())) + .lazy() + .rename(cols_to_read) + ) + + @compile_node.register + def compile_filter(self, node: nodes.FilterNode): + return self.compile_node(node.child).filter( + self.expr_compiler.compile_expression(node.predicate) + ) + + @compile_node.register + def compile_orderby(self, node: nodes.OrderByNode): + frame = self.compile_node(node.child) + if len(node.by) == 0: + # pragma: no cover + return frame + + frame = frame.sort( + [ + self.expr_compiler.compile_expression(by.scalar_expression) + for by in node.by + ], + descending=[not by.direction.is_ascending for by in node.by], + nulls_last=[by.na_last for by in node.by], + maintain_order=True, + ) + return frame + + @compile_node.register + def compile_reversed(self, node: nodes.ReversedNode): + return self.compile_node(node.child).reverse() + + @compile_node.register + def compile_selection(self, node: nodes.SelectionNode): + return self.compile_node(node.child).select( + **{new.sql: orig.id.sql for orig, new in node.input_output_pairs} + ) + + @compile_node.register + def compile_projection(self, node: nodes.ProjectionNode): + new_cols = [ + self.expr_compiler.compile_expression(ex).alias(name.sql) + for ex, name in node.assignments + ] + return self.compile_node(node.child).with_columns(new_cols) + + @compile_node.register + def compile_rowcount(self, node: nodes.RowCountNode): + df = cast(pl.LazyFrame, self.compile_node(node.child)) + return df.select(pl.len().alias(node.col_id.sql)) + + @compile_node.register + def compile_offsets(self, node: nodes.PromoteOffsetsNode): + return self.compile_node(node.child).with_columns( + [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.col_id.sql)] + ) + + @compile_node.register + def compile_join(self, node: nodes.JoinNode): + # Always totally order this, as adding offsets is relatively cheap for in-memory columnar data + left = self.compile_node(node.left_child).with_columns( + [ + pl.int_range(pl.len()).alias("_bf_join_l"), + ] + ) + right = self.compile_node(node.right_child).with_columns( + [ + pl.int_range(pl.len()).alias("_bf_join_r"), + ] + ) + if node.type != "cross": + left_on = [l_name.id.sql for l_name, _ in node.conditions] + right_on = [r_name.id.sql for _, r_name in node.conditions] + joined = left.join( + right, 
how=node.type, left_on=left_on, right_on=right_on, coalesce=False + ) + else: + joined = left.join(right, how=node.type) + return joined.sort(["_bf_join_l", "_bf_join_r"]).drop( + ["_bf_join_l", "_bf_join_r"] + ) + + @compile_node.register + def compile_concat(self, node: nodes.ConcatNode): + return pl.concat(self.compile_node(child) for child in node.child_nodes) + + @compile_node.register + def compile_agg(self, node: nodes.AggregateNode): + df = self.compile_node(node.child) + + # Need to materialize columns to broadcast constants + agg_inputs = [ + list( + map( + lambda x: x.alias(guid.generate_guid()), + self.agg_compiler.get_args(agg), + ) + ) + for agg, _ in node.aggregations + ] + + df_agg_inputs = df.with_columns(itertools.chain(*agg_inputs)) + + agg_exprs = [ + self.agg_compiler.compile_agg_op( + agg.op, list(map(lambda x: x.meta.output_name(), inputs)) + ).alias(id.sql) + for (agg, id), inputs in zip(node.aggregations, agg_inputs) + ] + + if len(node.by_column_ids) > 0: + group_exprs = [pl.col(ref.id.sql) for ref in node.by_column_ids] + grouped_df = df_agg_inputs.group_by(group_exprs) + return grouped_df.agg(agg_exprs).sort(group_exprs) + else: + return df_agg_inputs.select(agg_exprs) + + @compile_node.register + def compile_explode(self, node: nodes.ExplodeNode): + df = self.compile_node(node.child) + cols = [pl.col(col.id.sql) for col in node.column_ids] + return df.explode(cols) + + @compile_node.register + def compile_sample(self, node: nodes.RandomSampleNode): + df = self.compile_node(node.child) + # Sample is not available on lazyframe + return df.collect().sample(fraction=node.fraction).lazy() + + @compile_node.register + def compile_window(self, node: nodes.WindowOpNode): + df = self.compile_node(node.child) + agg_expr = self.agg_compiler.compile_agg_op( + node.op, [node.column_name.id.sql] + ).alias(node.output_name.sql) + # Three window types: completely unbound, grouped and row bounded + + window = node.window_spec + + if window.min_periods > 0: + raise NotImplementedError("min_period not yet supported for polars engine") + + if window.bounds is None: + # polars will automatically broadcast the aggregate to the matching input rows + if len(window.grouping_keys) == 0: # unbound window + pass + else: # partition-only window + agg_expr = agg_expr.over( + partition_by=[ref.id.sql for ref in window.grouping_keys] + ) + return df.with_columns([agg_expr]) + + else: # row-bounded window + # Polars API semi-bounded, and any grouped rolling window challenging + # https://github.com/pola-rs/polars/issues/4799 + # https://github.com/pola-rs/polars/issues/8976 + index_col_name = "_bf_pl_engine_offsets" + indexed_df = df.with_row_index(index_col_name) + if len(window.grouping_keys) == 0: # rolling-only window + # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html + finite = ( + window.bounds.preceding is not None + and window.bounds.following is not None + ) + offset_n = ( + None + if window.bounds.preceding is None + else -window.bounds.preceding + ) + # collecting height is a massive kludge + period_n = ( + df.collect().height + if not finite + else cast(int, window.bounds.preceding) + + cast(int, window.bounds.following) + + 1 + ) + results = indexed_df.rolling( + index_column=index_col_name, + period=f"{period_n}i", + offset=f"{offset_n}i" if offset_n else None, + ).agg(agg_expr) + else: # groupby-rolling window + raise NotImplementedError( + "Groupby rolling windows not yet implemented in polars engine" + ) + # polars is columnar, so 
this is efficient + # TODO: why can't just add columns? + return pl.concat([df, results], how="horizontal") diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 80e354aa8c..e7526ca48b 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -947,7 +947,9 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): return result.cast(result.type()(nullable=True)).name(name) -def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue: +def numeric_to_datetime( + x: ibis_types.Value, unit: str, safe: bool = False +) -> ibis_types.TimestampValue: if not isinstance(x, ibis_types.IntegerValue) and not isinstance( x, ibis_types.FloatingValue ): @@ -956,7 +958,11 @@ def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampV if unit not in UNIT_TO_US_CONVERSION_FACTORS: raise ValueError(f"Cannot convert input with unit '{unit}'.") x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit] - x_converted = x_converted.cast(ibis_dtypes.int64) + x_converted = ( + x_converted.try_cast(ibis_dtypes.int64) + if safe + else x_converted.cast(ibis_dtypes.int64) + ) # Note: Due to an issue where casting directly to a timestamp # without a timezone does not work, we first cast to UTC. This @@ -978,8 +984,11 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): # When casting DATETIME column into INT column, we need to convert the column into TIMESTAMP first. if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.timestamp: - x_converted = x.cast(ibis_dtypes.Timestamp(timezone="UTC")) - return bigframes.core.compile.ibis_types.cast_ibis_value(x_converted, to_type) + utc_time_type = ibis_dtypes.Timestamp(timezone="UTC") + x_converted = x.try_cast(utc_time_type) if op.safe else x.cast(utc_time_type) + return bigframes.core.compile.ibis_types.cast_ibis_value( + x_converted, to_type, safe=op.safe + ) if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.time: # The conversion unit is set to "us" (microseconds) for consistency @@ -991,15 +1000,20 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): # with pandas converting int64[pyarrow] to timestamp[us][pyarrow], # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow]. 
unit = "us" - x_converted = numeric_to_datetime(x, unit) + x_converted = numeric_to_datetime(x, unit, safe=op.safe) if to_type == ibis_dtypes.timestamp: - return x_converted.cast(ibis_dtypes.Timestamp()) + return ( + x_converted.try_cast(ibis_dtypes.Timestamp()) + if op.safe + else x_converted.cast(ibis_dtypes.Timestamp()) + ) elif to_type == ibis_dtypes.Timestamp(timezone="UTC"): return x_converted elif to_type == ibis_dtypes.time: return x_converted.time() - return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type) + # TODO: either inline this function, or push rest of this op into the function + return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) @scalar_op_compiler.register_unary_op(ops.IsInOp, pass_op=True) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 0ba79bebee..a8445835dd 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -17,7 +17,7 @@ from __future__ import annotations import typing -from typing import Hashable, Optional, Sequence, Union +from typing import Hashable, Literal, Optional, Sequence, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index @@ -324,11 +324,17 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"): def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], + *, + errors: Literal["raise", "null"] = "raise", ) -> Index: + if errors not in ["raise", "null"]: + raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") if self.nlevels > 1: raise TypeError("Multiindex does not support 'astype'") return self._apply_unary_expr( - ops.AsTypeOp(to_type=dtype).as_expr(ex.free_var("arg")) + ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr( + ex.free_var("arg") + ) ) def all(self) -> bool: diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index ac658d1bb8..573562cefa 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -59,5 +59,8 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType: if pa.types.is_time64(type): # This is potentially lossy, but BigFrames doesn't support ns return pa.time64("us") + if pa.types.is_large_string(type): + # simple string type can handle the largest strings needed + return pa.string() else: return type diff --git a/bigframes/core/slices.py b/bigframes/core/slices.py index 97f90d3349..68ec79f9fb 100644 --- a/bigframes/core/slices.py +++ b/bigframes/core/slices.py @@ -30,13 +30,17 @@ def to_forward_offsets( start = 0 if (step > 0) else (input_rows - 1) elif start < 0: start = max(0, input_rows + start) - else: - start = min(start, input_rows) + else: # start >= 0 + # Clip start to either beginning or end depending on step direction + start = min(start, input_rows - 1) if step < 0 else start if stop is None: stop = None elif stop < 0: - stop = max(0, input_rows + stop) + if step > 0: + stop = max(0, input_rows + stop) + else: + stop = input_rows + stop if (input_rows + stop >= 0) else None else: stop = min(stop, input_rows) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f803b66ab6..0b639a5649 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -365,8 +365,14 @@ def __iter__(self): def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], + *, + errors: Literal["raise", "null"] = "raise", ) -> DataFrame: - return self._apply_unary_op(ops.AsTypeOp(to_type=dtype)) 
+        if errors not in ["raise", "null"]:
+            raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
+        return self._apply_unary_op(
+            ops.AsTypeOp(to_type=dtype, safe=(errors == "null"))
+        )
 
     def _to_sql_query(
         self, include_index: bool, enable_cache: bool = True
@@ -734,7 +740,7 @@ def _apply_binop(
         how: str = "outer",
         reverse: bool = False,
     ):
-        if isinstance(other, (float, int, bool)):
+        if isinstance(other, bigframes.dtypes.LOCAL_SCALAR_TYPES):
             return self._apply_scalar_binop(other, op, reverse=reverse)
         elif isinstance(other, DataFrame):
             return self._apply_dataframe_binop(other, op, how=how, reverse=reverse)
@@ -752,7 +758,10 @@ def _apply_binop(
         )
 
     def _apply_scalar_binop(
-        self, other: float | int, op: ops.BinaryOp, reverse: bool = False
+        self,
+        other: bigframes.dtypes.LOCAL_SCALAR_TYPE,
+        op: ops.BinaryOp,
+        reverse: bool = False,
     ) -> DataFrame:
         if reverse:
             expr = op.as_expr(
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index bc5b89b779..c71531f9f3 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -59,6 +59,25 @@
 
 # Used when storing Null expressions
 DEFAULT_DTYPE = FLOAT_DTYPE
 
+LOCAL_SCALAR_TYPE = Union[
+    bool,
+    np.bool_,
+    int,
+    np.integer,
+    float,
+    np.floating,
+    decimal.Decimal,
+    str,
+    np.str_,
+    bytes,
+    np.bytes_,
+    datetime.datetime,
+    pd.Timestamp,
+    datetime.date,
+    datetime.time,
+]
+LOCAL_SCALAR_TYPES = typing.get_args(LOCAL_SCALAR_TYPE)
+
 # Will have a few dtype variants: simple(eg. int, string, bool), complex (eg. list, struct), and virtual (eg. micro intervals, categorical)
 @dataclass(frozen=True)
diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py
index 27d9bfb4f4..5b442772ce 100644
--- a/bigframes/ml/compose.py
+++ b/bigframes/ml/compose.py
@@ -68,7 +68,8 @@ class SQLScalarColumnTransformer:
 
         >>> from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer
         >>> import bigframes.pandas as bpd
-
+        >>> bpd.options.display.progress_bar = None
+        >>> df = bpd.DataFrame({'name': ["James", None, "Mary"], 'city': ["New York", "Boston", None]})
         >>> col_trans = ColumnTransformer([
         ...
("strlen", diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 4bc61c5015..be67396fba 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -307,9 +307,11 @@ def create_model( # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot if y_train is None: - input_data = X_train.cache() + input_data = X_train.reset_index(drop=True).cache() else: - input_data = X_train.join(y_train, how="outer").cache() + input_data = ( + X_train.join(y_train, how="outer").reset_index(drop=True).cache() + ) options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 2e2e4a0552..fa38be368f 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -494,6 +494,7 @@ class AsTypeOp(UnaryOp): name: typing.ClassVar[str] = "astype" # TODO: Convert strings to dtype earlier to_type: dtypes.DtypeString | dtypes.Dtype + safe: bool = False def output_type(self, *input_types): # TODO: We should do this conversion earlier diff --git a/bigframes/series.py b/bigframes/series.py index 1d44cdd963..92857d726d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -352,8 +352,14 @@ def __repr__(self) -> str: def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], + *, + errors: Literal["raise", "null"] = "raise", ) -> Series: - return self._apply_unary_op(bigframes.operations.AsTypeOp(to_type=dtype)) + if errors not in ["raise", "null"]: + raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") + return self._apply_unary_op( + bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null")) + ) def to_pandas( self, diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 1c8d497974..21789835e7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -27,7 +27,6 @@ IO, Iterable, Literal, - Mapping, MutableSequence, Optional, Sequence, @@ -59,7 +58,6 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes.clients -import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.compile import bigframes.core.guid @@ -258,12 +256,14 @@ def __init__( kms_key=self._bq_kms_key_name, ) ) - self._executor = bigframes.session.executor.BigQueryCachingExecutor( - bqclient=self._clients_provider.bqclient, - bqstoragereadclient=self._clients_provider.bqstoragereadclient, - storage_manager=self._temp_storage_manager, - strictly_ordered=self._strictly_ordered, - metrics=self._metrics, + self._executor: bigframes.session.executor.Executor = ( + bigframes.session.executor.BigQueryCachingExecutor( + bqclient=self._clients_provider.bqclient, + bqstoragereadclient=self._clients_provider.bqstoragereadclient, + storage_manager=self._temp_storage_manager, + strictly_ordered=self._strictly_ordered, + metrics=self._metrics, + ) ) self._loader = bigframes.session.loader.GbqDataLoader( session=self, @@ -946,6 +946,14 @@ def read_parquet( path, table, job_config=job_config ) else: + if "*" in path: + raise ValueError( + "The provided path contains a wildcard character (*), which is not " + "supported by the current engine. To read files from wildcard paths, " + "please use the 'bigquery' engine by setting `engine='bigquery'` in " + "your configuration." 
+ ) + read_parquet_kwargs: Dict[str, Any] = {} if pandas.__version__.startswith("1."): read_parquet_kwargs["use_nullable_dtypes"] = True @@ -1415,24 +1423,6 @@ def _start_query_ml_ddl( self.bqclient, sql, job_config, metrics=self._metrics ) - def _export( - self, - array_value: core.ArrayValue, - destination: bigquery.TableReference, - *, - if_exists: Literal["fail", "replace", "append"] = "fail", - col_id_overrides: Mapping[str, str] = {}, - cluster_cols: Sequence[str], - ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - # Note: cluster_cols use pre-override column ids - return self._executor.export_gbq( - array_value, - destination=destination, - col_id_overrides=col_id_overrides, - if_exists=if_exists, - cluster_cols=cluster_cols, - ) - def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 6ceaab6915..be8f3a8537 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -24,6 +24,7 @@ import pyarrow.types # type: ignore import bigframes.core.schema +import bigframes.dtypes import bigframes.features @@ -102,6 +103,12 @@ def arrow_to_pandas( else mask.to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) + elif dtype == bigframes.dtypes.STRING_DTYPE: + # Pyarrow may be large_string + # Need to manually cast, as some pandas versions break otherwise + series = column.cast(pyarrow.string()).to_pandas( + types_mapper=lambda _: dtype + ) elif isinstance(dtype, pandas.ArrowDtype): series = _arrow_to_pandas_arrowdtype(column, dtype) else: diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 170f0ac086..d19ec23501 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -14,6 +14,7 @@ from __future__ import annotations +import abc import dataclasses import math import os @@ -80,7 +81,111 @@ def to_arrow_table(self) -> pyarrow.Table: ) -class BigQueryCachingExecutor: +class Executor(abc.ABC): + """ + Interface for an executor, which compiles and executes ArrayValue objects. + """ + + def to_sql( + self, + array_value: bigframes.core.ArrayValue, + offset_column: Optional[str] = None, + col_id_overrides: Mapping[str, str] = {}, + ordered: bool = False, + enable_cache: bool = True, + ) -> str: + """ + Convert an ArrayValue to a sql query that will yield its value. + """ + raise NotImplementedError("to_sql not implemented for this executor") + + def execute( + self, + array_value: bigframes.core.ArrayValue, + *, + ordered: bool = True, + col_id_overrides: Mapping[str, str] = {}, + use_explicit_destination: bool = False, + get_size_bytes: bool = False, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + ): + """ + Execute the ArrayValue, storing the result to a temporary session-owned table. + """ + raise NotImplementedError("execute not implemented for this executor") + + def export_gbq( + self, + array_value: bigframes.core.ArrayValue, + col_id_overrides: Mapping[str, str], + destination: bigquery.TableReference, + if_exists: Literal["fail", "replace", "append"] = "fail", + cluster_cols: Sequence[str] = [], + ) -> bigquery.QueryJob: + """ + Export the ArrayValue to an existing BigQuery table. 
+ """ + raise NotImplementedError("export_gbq not implemented for this executor") + + def export_gcs( + self, + array_value: bigframes.core.ArrayValue, + col_id_overrides: Mapping[str, str], + uri: str, + format: Literal["json", "csv", "parquet"], + export_options: Mapping[str, Union[bool, str]], + ) -> bigquery.QueryJob: + """ + Export the ArrayValue to gcs. + """ + raise NotImplementedError("export_gcs not implemented for this executor") + + def dry_run( + self, array_value: bigframes.core.ArrayValue, ordered: bool = True + ) -> bigquery.QueryJob: + """ + Dry run executing the ArrayValue. + + Does not actually execute the data but will get stats and indicate any invalid query errors. + """ + raise NotImplementedError("dry_run not implemented for this executor") + + def peek( + self, + array_value: bigframes.core.ArrayValue, + n_rows: int, + ) -> ExecuteResult: + """ + A 'peek' efficiently accesses a small number of rows in the dataframe. + """ + raise NotImplementedError("peek not implemented for this executor") + + # TODO: Remove this and replace with efficient slice operator that can use execute() + def head( + self, array_value: bigframes.core.ArrayValue, n_rows: int + ) -> ExecuteResult: + """ + Preview the first n rows of the dataframe. This is less efficient than the unordered peek preview op. + """ + raise NotImplementedError("head not implemented for this executor") + + # TODO: This should be done through execute() + def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: + raise NotImplementedError("get_row_count not implemented for this executor") + + def cached( + self, + array_value: bigframes.core.ArrayValue, + *, + force: bool = False, + use_session: bool = False, + cluster_cols: Sequence[str] = (), + ) -> None: + raise NotImplementedError("cached not implemented for this executor") + + +class BigQueryCachingExecutor(Executor): """Computes BigFrames values using BigQuery Engine. This executor can cache expressions. If those expressions are executed later, this session @@ -94,6 +199,7 @@ def __init__( bqclient: bigquery.Client, storage_manager: bigframes.session.temp_storage.TemporaryGbqStorageManager, bqstoragereadclient: google.cloud.bigquery_storage_v1.BigQueryReadClient, + *, strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, ): @@ -117,9 +223,6 @@ def to_sql( ordered: bool = False, enable_cache: bool = True, ) -> str: - """ - Convert an ArrayValue to a sql query that will yield its value. - """ if offset_column: array_value, internal_offset_col = array_value.promote_offsets() col_id_overrides = dict(col_id_overrides) @@ -146,9 +249,6 @@ def execute( page_size: Optional[int] = None, max_results: Optional[int] = None, ): - """ - Execute the ArrayValue, storing the result to a temporary session-owned table. - """ if bigframes.options.compute.enable_multi_query_execution: self._simplify_with_caching(array_value) @@ -235,9 +335,6 @@ def export_gcs( format: Literal["json", "csv", "parquet"], export_options: Mapping[str, Union[bool, str]], ): - """ - Export the ArrayValue to gcs. - """ query_job = self.execute( array_value, ordered=False, @@ -259,11 +356,6 @@ def export_gcs( def dry_run( self, array_value: bigframes.core.ArrayValue, ordered: bool = True ) -> bigquery.QueryJob: - """ - Dry run executing the ArrayValue. - - Does not actually execute the data but will get stats and indicate any invalid query errors. 
- """ sql = self.to_sql(array_value, ordered=ordered) job_config = bigquery.QueryJobConfig(dry_run=True) bq_io.add_labels(job_config) @@ -298,14 +390,9 @@ def peek( total_rows=iterator.total_rows, ) - # This is used exclusively to optimize __repr__ - # TODO: We need to model this def head( self, array_value: bigframes.core.ArrayValue, n_rows: int ) -> ExecuteResult: - """ - Preview the first n rows of the dataframe. This is less efficient than the unordered peek preview op. - """ maybe_row_count = self._local_get_row_count(array_value) if (maybe_row_count is not None) and (maybe_row_count <= n_rows): return self.execute(array_value, ordered=True) @@ -341,7 +428,6 @@ def head( total_rows=iterator.total_rows, ) - # TODO: Remove. We shouldn't need this method, row count node can automatically be detected def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: count = self._local_get_row_count(array_value) if count is not None: @@ -354,6 +440,23 @@ def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: iter, _ = self._run_execute_query(sql) return next(iter)[0] + def cached( + self, + array_value: bigframes.core.ArrayValue, + *, + force: bool = False, + use_session: bool = False, + cluster_cols: Sequence[str] = (), + ) -> None: + """Write the block to a session table.""" + # use a heuristic for whether something needs to be cached + if (not force) and self._is_trivially_executable(array_value): + return + elif use_session: + self._cache_with_session_awareness(array_value) + else: + self._cache_with_cluster_cols(array_value, cluster_cols=cluster_cols) + def _local_get_row_count( self, array_value: bigframes.core.ArrayValue ) -> Optional[int]: @@ -379,11 +482,12 @@ def _run_execute_query( job_config.maximum_bytes_billed = ( bigframes.options.compute.maximum_bytes_billed ) - # Note: add_labels is global scope which may have unexpected effects - bq_io.add_labels(job_config, api_name=api_name) if not self.strictly_ordered: job_config.labels["bigframes-mode"] = "unordered" + + # Note: add_labels is global scope which may have unexpected effects + bq_io.add_labels(job_config, api_name=api_name) try: query_job = self.bqclient.query(sql, job_config=job_config) return ( diff --git a/bigframes/version.py b/bigframes/version.py index cdbacaa9cb..20e06673e1 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.26.0" +__version__ = "1.27.0" diff --git a/noxfile.py b/noxfile.py index 24aec29c6c..341de704e5 100644 --- a/noxfile.py +++ b/noxfile.py @@ -61,7 +61,7 @@ UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = [] -UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} +UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]} # There are 4 different ibis-framework 9.x versions we want to test against. # 3.10 is needed for Windows tests. 
@@ -251,6 +251,7 @@ def mypy(session): "types-requests", "types-setuptools", "types-tabulate", + "polars", ] ) | set(SYSTEM_TEST_STANDARD_DEPENDENCIES) @@ -396,6 +397,8 @@ def doctest(session: nox.sessions.Session): "third_party", "--ignore", "third_party/bigframes_vendored/ibis", + "--ignore", + "bigframes/core/compile/polars", ), test_folder="bigframes", check_cov=True, diff --git a/setup.py b/setup.py index 833d4fe565..284a155d4f 100644 --- a/setup.py +++ b/setup.py @@ -70,6 +70,8 @@ "tests": [ "pandas-gbq >=0.19.0", ], + # used for local engine, which is only needed for unit tests at present. + "polars": ["polars >= 1.7.0"], # Packages required for basic development flow. "dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"], } diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index 152fd168be..39368f490b 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -154,3 +154,13 @@ def test_cluster_configure_fit_load_params(penguins_df_default_index, dataset_id assert reloaded_model.distance_type == "COSINE" assert reloaded_model.max_iter == 30 assert reloaded_model.tol == 0.001 + + +def test_model_centroids_with_custom_index(penguins_df_default_index): + model = cluster.KMeans(n_clusters=3) + penguins = penguins_df_default_index.set_index(["species", "island", "sex"]) + model.fit(penguins) + + assert ( + not model.cluster_centers_["feature"].isin(["species", "island", "sex"]).any() + ) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index f6ca26e7e4..96215c5e47 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -425,3 +425,30 @@ def test_logistic_regression_customized_params_fit_score( assert reloaded_model.tol == 0.02 assert reloaded_model.learning_rate_strategy == "CONSTANT" assert reloaded_model.learning_rate == 0.2 + + +def test_model_centroids_with_custom_index(penguins_df_default_index): + model = bigframes.ml.linear_model.LogisticRegression( + fit_intercept=False, + class_weight="balanced", + l2_reg=0.2, + tol=0.02, + l1_reg=0.2, + max_iterations=30, + optimize_strategy="batch_gradient_descent", + learning_rate_strategy="constant", + learning_rate=0.2, + ) + df = penguins_df_default_index.dropna().set_index(["species", "island"]) + X_train = df[ + [ + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + ] + ] + y_train = df[["sex"]] + model.fit(X_train, y_train) + + # If this line executes without errors, the model has correctly ignored the custom index columns + model.predict(X_train.reset_index(drop=True)) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 1fb12d3f82..f69eb2eb4a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1020,13 +1020,20 @@ def test_df_interpolate(scalars_dfs): ) -def test_df_fillna(scalars_dfs): +@pytest.mark.parametrize( + "col, fill_value", + [ + (["int64_col", "float64_col"], 3), + (["string_col"], "A"), + (["datetime_col"], pd.Timestamp("2023-01-01")), + ], +) +def test_df_fillna(scalars_dfs, col, fill_value): scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df[["int64_col", "float64_col"]].fillna(3) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df[["int64_col", "float64_col"]].fillna(3) + bf_result = scalars_df[col].fillna(fill_value).to_pandas() + pd_result = scalars_pandas_df[col].fillna(fill_value) - 
pandas.testing.assert_frame_equal(bf_result, pd_result) + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_df_replace_scalar_scalar(scalars_dfs): @@ -3680,6 +3687,12 @@ def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis): ) +def test_df_astype_error_error(session): + input = pd.DataFrame(["hello", "world", "3.11", "4000"]) + with pytest.raises(ValueError): + session.read_pandas(input).astype("Float64", errors="bad_value") + + def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index): if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): pytest.skip("pandas filter items behavior different pre-2.1") diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index d68cf6c3f3..cdf4fa6511 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -123,6 +123,12 @@ def test_index_astype(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_index_equal(bf_result, pd_result) +def test_index_astype_error_error(session): + input = pd.Index(["hello", "world", "3.11", "4000"]) + with pytest.raises(ValueError): + session.read_pandas(input).astype("Float64", errors="bad_value") + + def test_index_any(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.set_index("int64_col").index.any() pd_result = scalars_pandas_df_index.set_index("int64_col").index.any() diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index b906f452b7..0cc8cd4cbe 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2768,20 +2768,25 @@ def test_series_case_when(scalars_dfs_maybe_ordered): # TODO(tswast): pandas case_when appears to assume True when a value is # null. I suspect this should be considered a bug in pandas. 
- bf_result = bf_series.case_when( - [ - ((bf_series > 100).fillna(True), bf_series - 1), - ((bf_series > 0).fillna(True), pd.NA), - ((bf_series < -100).fillna(True), -1000), - ] - ).to_pandas() - pd_result = pd_series.case_when( - [ - (pd_series > 100, pd_series - 1), - (pd_series > 0, pd.NA), - (pd_series < -100, -1000), - ] + + # Generate 150 conditions to test case_when with a large number of conditions + bf_conditions = ( + [((bf_series > 645).fillna(True), bf_series - 1)] + + [((bf_series > (-100 + i * 5)).fillna(True), i) for i in range(148, 0, -1)] + + [((bf_series <= -100).fillna(True), pd.NA)] + ) + + pd_conditions = ( + [((pd_series > 645), pd_series - 1)] + + [((pd_series > (-100 + i * 5)), i) for i in range(148, 0, -1)] + + [(pd_series <= -100, pd.NA)] ) + + assert len(bf_conditions) == 150 + + bf_result = bf_series.case_when(bf_conditions).to_pandas() + pd_result = pd_series.case_when(pd_conditions) + pd.testing.assert_series_equal( bf_result, pd_result.astype(pd.Int64Dtype()), @@ -3087,6 +3092,7 @@ def foo(x): assert_series_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.parametrize("errors", ["raise", "null"]) @pytest.mark.parametrize( ("column", "to_type"), [ @@ -3102,6 +3108,7 @@ def foo(x): ("int64_col", "time64[us][pyarrow]"), ("bool_col", "Int64"), ("bool_col", "string[pyarrow]"), + ("bool_col", "Float64"), ("string_col", "binary[pyarrow]"), ("bytes_col", "string[pyarrow]"), # pandas actually doesn't let folks convert to/from naive timestamp and @@ -3137,12 +3144,29 @@ def foo(x): ], ) @skip_legacy_pandas -def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type): - bf_result = scalars_df_index[column].astype(to_type).to_pandas() +def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, errors): + bf_result = scalars_df_index[column].astype(to_type, errors=errors).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) pd.testing.assert_series_equal(bf_result, pd_result) +def test_astype_safe(session): + input = pd.Series(["hello", "world", "3.11", "4000"]) + exepcted = pd.Series( + [None, None, 3.11, 4000], + dtype="Float64", + index=pd.Index([0, 1, 2, 3], dtype="Int64"), + ) + result = session.read_pandas(input).astype("Float64", errors="null").to_pandas() + pd.testing.assert_series_equal(result, exepcted) + + +def test_series_astype_error_error(session): + input = pd.Series(["hello", "world", "3.11", "4000"]) + with pytest.raises(ValueError): + session.read_pandas(input).astype("Float64", errors="bad_value") + + @skip_legacy_pandas def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index): column = "numeric_col" diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 580ae80ef1..63ee080ad3 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1126,17 +1126,46 @@ def test_read_pickle_gcs(session, penguins_pandas_df_default_index, gcs_folder): @pytest.mark.parametrize( - ("engine",), + ("engine", "filename"), ( - ("auto",), - ("bigquery",), + pytest.param( + "auto", + "000000000000.parquet", + id="auto", + ), + pytest.param( + "pyarrow", + "000000000000.parquet", + id="pyarrow", + ), + pytest.param( + "bigquery", + "000000000000.parquet", + id="bigquery", + ), + pytest.param( + "bigquery", + "*.parquet", + id="bigquery_wildcard", + ), + pytest.param( + "auto", + "*.parquet", + id="auto_wildcard", + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), ), ) -def test_read_parquet_gcs(session: bigframes.Session, 
scalars_dfs, gcs_folder, engine): +def test_read_parquet_gcs( + session: bigframes.Session, scalars_dfs, gcs_folder, engine, filename +): scalars_df, _ = scalars_dfs # Include wildcard so that multiple files can be written/read if > 1 GB. # https://cloud.google.com/bigquery/docs/exporting-data#exporting_data_into_one_or_more_files - path = gcs_folder + test_read_parquet_gcs.__name__ + "*.parquet" + write_path = gcs_folder + test_read_parquet_gcs.__name__ + "*.parquet" + read_path = gcs_folder + test_read_parquet_gcs.__name__ + filename df_in: bigframes.dataframe.DataFrame = scalars_df.copy() # GEOGRAPHY not supported in parquet export. @@ -1144,14 +1173,10 @@ def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder, e # Make sure we can also serialize the order. df_write = df_in.reset_index(drop=False) df_write.index.name = f"ordering_id_{random.randrange(1_000_000)}" - df_write.to_parquet(path, index=True) - - # Only bigquery engine for reads supports wildcards in path name. - if engine != "bigquery": - path = utils.get_first_file_from_wildcard(path) + df_write.to_parquet(write_path, index=True) df_out = ( - session.read_parquet(path, engine=engine) + session.read_parquet(read_path, engine=engine) # Restore order. .set_index(df_write.index.name).sort_index() # Restore index. diff --git a/tests/unit/_config/test_experiment_options.py b/tests/unit/_config/test_experiment_options.py index 49c3d9e53c..e48479885d 100644 --- a/tests/unit/_config/test_experiment_options.py +++ b/tests/unit/_config/test_experiment_options.py @@ -30,3 +30,18 @@ def test_semantic_operators_set_true_shows_warning(): options.semantic_operators = True assert options.semantic_operators is True + + +def test_blob_default_false(): + options = experiment_options.ExperimentOptions() + + assert options.blob is False + + +def test_blob_set_true_shows_warning(): + options = experiment_options.ExperimentOptions() + + with pytest.warns(UserWarning): + options.blob = True + + assert options.blob is True diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index ce05011546..b6409e9532 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -85,6 +85,17 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) + + mock_X.join(mock_y).reset_index(drop=True).sql = "input_X_y_no_index_sql" + mock_X.join(mock_y).reset_index(drop=True).cache.return_value = mock_X.join( + mock_y + ).reset_index(drop=True) + mock_X.join(mock_y).reset_index(drop=True)._to_sql_query.return_value = ( + "input_X_y_no_index_sql", + ["index_column_id"], + ["index_column_label"], + ) + mock_X.cache.return_value = mock_X return mock_X @@ -107,7 +118,7 @@ def test_linear_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n optimize_strategy='auto_strategy',\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n optimize_strategy='auto_strategy',\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n 
min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql" ) @@ -117,7 +128,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n optimize_strategy='auto_strategy',\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n optimize_strategy='auto_strategy',\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql" ) @@ -150,7 +161,7 @@ def test_logistic_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=True,\n auto_class_weights=False,\n optimize_strategy='auto_strategy',\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=True,\n auto_class_weights=False,\n optimize_strategy='auto_strategy',\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql" ) @@ -172,7 +183,7 @@ def test_logistic_regression_params_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=False,\n auto_class_weights=True,\n optimize_strategy='batch_gradient_descent',\n l2_reg=0.2,\n max_iterations=30,\n learn_rate_strategy='constant',\n min_rel_progress=0.02,\n calculate_p_values=False,\n enable_global_explain=False,\n l1_reg=0.2,\n learn_rate=0.2,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=False,\n auto_class_weights=True,\n optimize_strategy='batch_gradient_descent',\n l2_reg=0.2,\n max_iterations=30,\n learn_rate_strategy='constant',\n min_rel_progress=0.02,\n calculate_p_values=False,\n enable_global_explain=False,\n l1_reg=0.2,\n learn_rate=0.2,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql" ) diff --git a/tests/unit/polars_session.py b/tests/unit/polars_session.py new file mode 100644 index 0000000000..76705ddd74 --- /dev/null +++ b/tests/unit/polars_session.py @@ -0,0 +1,93 @@ +# Copyright 
2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +from typing import Mapping, Optional, Union +import weakref + +import polars + +import bigframes +import bigframes.clients +import bigframes.core.blocks +import bigframes.core.compile.polars +import bigframes.core.ordering +import bigframes.dataframe +import bigframes.session.clients +import bigframes.session.executor +import bigframes.session.metrics + + +# Does not support to_sql, export_gbq, export_gcs, dry_run, peek, head, get_row_count, cached +@dataclasses.dataclass +class TestExecutor(bigframes.session.executor.Executor): + compiler = bigframes.core.compile.polars.PolarsCompiler() + + def execute( + self, + array_value: bigframes.core.ArrayValue, + *, + ordered: bool = True, + col_id_overrides: Mapping[str, str] = {}, + use_explicit_destination: bool = False, + get_size_bytes: bool = False, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + ): + """ + Execute the ArrayValue locally with polars and return the result as in-memory Arrow record batches. + """ + lazy_frame: polars.LazyFrame = self.compiler.compile(array_value) + pa_table = lazy_frame.collect().to_arrow() + # Currently, the pyarrow types may not exactly match the bigframes schema: + # nullability can differ, and large variants of the list and string datatypes may be used. + return bigframes.session.executor.ExecuteResult( + arrow_batches=lambda: pa_table.to_batches(), + schema=array_value.schema, + total_bytes=pa_table.nbytes, + total_rows=pa_table.num_rows, + ) + + +class TestSession(bigframes.session.Session): + def __init__(self): + self._location = None # type: ignore + self._bq_kms_key_name = None # type: ignore + self._clients_provider = None # type: ignore + self.ibis_client = None # type: ignore + self._bq_connection = None # type: ignore + self._skip_bq_connection_check = True + self._session_id: str = "test_session" + self._objects: list[ + weakref.ReferenceType[ + Union[ + bigframes.core.indexes.Index, + bigframes.series.Series, + bigframes.dataframe.DataFrame, + ] + ] + ] = [] + self._strictly_ordered: bool = True + self._allow_ambiguity = False # type: ignore + self._default_index_type = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 + self._metrics = bigframes.session.metrics.ExecutionMetrics() + self._remote_function_session = None # type: ignore + self._temp_storage_manager = None # type: ignore + self._executor = TestExecutor() + self._loader = None # type: ignore + + def read_pandas(self, pandas_dataframe): + # Override read_pandas to always keep data local-only. + local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) + return bigframes.dataframe.DataFrame(local_block) diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 04db840b28..48ceacfb91 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -13,22 +13,20 @@ # limitations under the License.
import datetime -from typing import Dict, List, Optional, Sequence +from typing import Optional, Sequence import unittest.mock as mock import google.auth.credentials import google.cloud.bigquery -import ibis -import pandas -import pyarrow as pa import pytest import bigframes import bigframes.clients -import bigframes.core as core import bigframes.core.ordering import bigframes.dataframe import bigframes.session.clients +import bigframes.session.executor +import bigframes.session.metrics """Utilities for creating test resources.""" @@ -125,24 +123,3 @@ def create_dataframe( monkeypatch.setattr(bigframes.core.global_session, "_global_session", session) bigframes.options.bigquery._session_started = True return bigframes.dataframe.DataFrame({"col": []}, session=session) - - -def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Session: - # TODO(tswast): Refactor to make helper available for all tests. Consider - # providing a proper "local Session" for use by downstream developers. - session = mock.create_autospec(bigframes.Session, instance=True) - ibis_client = ibis.pandas.connect(tables) - type(session).ibis_client = mock.PropertyMock(return_value=ibis_client) - return session - - -def create_arrayvalue( - df: pandas.DataFrame, total_ordering_columns: List[str] -) -> core.ArrayValue: - session = create_pandas_session({"test_table": df}) - return core.ArrayValue.from_pyarrow( - arrow_table=pa.Table.from_pandas(df, preserve_index=False), - session=session, - ).order_by( - [bigframes.core.ordering.ascending_over(col) for col in total_ordering_columns] - ) diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py new file mode 100644 index 0000000000..4697c84960 --- /dev/null +++ b/tests/unit/test_local_engine.py @@ -0,0 +1,218 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pandas.testing +import pyarrow as pa +import pytest + +import bigframes +import bigframes.pandas as bpd +from tests.system.utils import skip_legacy_pandas + +pytest.importorskip("polars") + + +# All tests in this file require polars to be installed to pass. +@pytest.fixture(scope="module") +def polars_session(): + from . import polars_session + + return polars_session.TestSession() + + +@pytest.fixture(scope="module") +def small_inline_frame() -> pd.DataFrame: + df = pd.DataFrame( + { + "int1": pd.Series([1, 2, 3], dtype="Int64"), + "int2": pd.Series([-10, 20, 30], dtype="Int64"), + "bools": pd.Series([True, None, False], dtype="boolean"), + "strings": pd.Series(["b", "aa", "ccc"], dtype="string[pyarrow]"), + "intLists": pd.Series( + [[1, 2, 3], [4, 5, 6, 7], None], + dtype=pd.ArrowDtype(pa.list_(pa.int64())), + ), + }, + ) + df.index = df.index.astype("Int64") + return df + + +# These tests should be unit tests, but Session object is tightly coupled to BigQuery client. 
+@skip_legacy_pandas +def test_polars_local_engine_add( + small_inline_frame: pd.DataFrame, polars_session: bigframes.Session +): + pd_df = small_inline_frame + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + bf_result = (bf_df["int1"] + bf_df["int2"]).to_pandas() + pd_result = pd_df.int1 + pd_df.int2 + pandas.testing.assert_series_equal(bf_result, pd_result) + + +@skip_legacy_pandas +def test_polars_local_engine_order_by(small_inline_frame: pd.DataFrame, polars_session): + pd_df = small_inline_frame + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + bf_result = bf_df.sort_values("strings").to_pandas() + pd_result = pd_df.sort_values("strings") + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@skip_legacy_pandas +def test_polars_local_engine_filter(small_inline_frame: pd.DataFrame, polars_session): + pd_df = small_inline_frame + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + bf_result = bf_df.filter(bf_df["int2"] >= 1).to_pandas() + pd_result = pd_df.filter(pd_df["int2"] >= 1) # type: ignore + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@skip_legacy_pandas +def test_polars_local_engine_reset_index( + small_inline_frame: pd.DataFrame, polars_session +): + pd_df = small_inline_frame + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + bf_result = bf_df.reset_index().to_pandas() + pd_result = pd_df.reset_index() + # pd default index is int64, bf is Int64 + pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@skip_legacy_pandas +def test_polars_local_engine_join_binop(polars_session): + pd_df_1 = pd.DataFrame({"colA": [1, None, 3], "colB": [3, 1, 2]}, index=[1, 2, 3]) + pd_df_2 = pd.DataFrame( + {"colA": [100, 200, 300], "colB": [30, 10, 40]}, index=[2, 1, 4] + ) + bf_df_1 = bpd.DataFrame(pd_df_1, session=polars_session) + bf_df_2 = bpd.DataFrame(pd_df_2, session=polars_session) + + bf_result = (bf_df_1 + bf_df_2).to_pandas() + pd_result = pd_df_1 + pd_df_2 + # Sort since different join ordering + pandas.testing.assert_frame_equal( + bf_result.sort_index(), + pd_result.sort_index(), + check_dtype=False, + check_index_type=False, + ) + + +@skip_legacy_pandas +@pytest.mark.parametrize( + "join_type", + ["inner", "left", "right", "outer"], +) +def test_polars_local_engine_joins(join_type, polars_session): + pd_df_1 = pd.DataFrame( + {"colA": [1, None, 3], "colB": [3, 1, 2]}, index=[1, 2, 3], dtype="Int64" + ) + pd_df_2 = pd.DataFrame( + {"colC": [100, 200, 300], "colD": [30, 10, 40]}, index=[2, 1, 4], dtype="Int64" + ) + bf_df_1 = bpd.DataFrame(pd_df_1, session=polars_session) + bf_df_2 = bpd.DataFrame(pd_df_2, session=polars_session) + + bf_result = bf_df_1.join(bf_df_2, how=join_type).to_pandas() + pd_result = pd_df_1.join(pd_df_2, how=join_type) + # Sort by index because ordering logic isn't same as pandas + pandas.testing.assert_frame_equal( + bf_result.sort_index(), pd_result.sort_index(), check_index_type=False + ) + + +@skip_legacy_pandas +def test_polars_local_engine_agg(polars_session): + pd_df = pd.DataFrame( + {"colA": [True, False, True, False, True], "colB": [1, 2, 3, 4, 5]} + ) + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + bf_result = bf_df.agg(["sum", "count"]).to_pandas() + pd_result = pd_df.agg(["sum", "count"]) + # local engine appears to produce uint32 + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False, check_index_type=False) # type: ignore + + +@skip_legacy_pandas +def test_polars_local_engine_groupby_sum(polars_session): + pd_df = 
pd.DataFrame( + {"colA": [True, False, True, False, True], "colB": [1, 2, 3, 4, 5]} + ) + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + bf_result = bf_df.groupby("colA").sum().to_pandas() + pd_result = pd_df.groupby("colA").sum() + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@skip_legacy_pandas +def test_polars_local_engine_cumsum(small_inline_frame, polars_session): + pd_df = small_inline_frame[["int1", "int2"]] + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + bf_result = bf_df.cumsum().to_pandas() + pd_result = pd_df.cumsum() + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@skip_legacy_pandas +def test_polars_local_engine_explode(small_inline_frame, polars_session): + pd_df = small_inline_frame + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + bf_result = bf_df.explode(["intLists"]).to_pandas() + pd_result = pd_df.explode(["intLists"]) + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (1, None, None), + (None, 4, None), + (None, None, 2), + (None, 50_000_000_000, 1), + (5, 4, None), + (3, None, 2), + (1, 7, 2), + (1, 7, 50_000_000_000), + (-1, -7, -2), + (None, -7, -2), + (-1, None, -2), + (-7, -1, 2), + (-7, -1, None), + (-7, 7, None), + (7, -7, -2), + ], +) +@skip_legacy_pandas +def test_polars_local_engine_slice( + small_inline_frame, polars_session, start, stop, step +): + pd_df = small_inline_frame + bf_df = bpd.DataFrame(pd_df, session=polars_session) + + bf_result = bf_df.iloc[start:stop:step].to_pandas() + pd_result = pd_df.iloc[start:stop:step] + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 101cdc5bd9..83a24f7a9c 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -180,6 +180,10 @@ def astype(self, dtype): ``pd.ArrowDtype(pa.time64("us"))``, ``pd.ArrowDtype(pa.timestamp("us"))``, ``pd.ArrowDtype(pa.timestamp("us", tz="UTC"))``. + errors ({'raise', 'null'}, default 'raise'): + Control raising of exceptions on invalid data for provided dtype. + If 'raise', allow exceptions to be raised if any value fails to cast. + If 'null', assign a null value if the value fails to cast. Returns: bigframes.pandas.DataFrame: diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 54320d8116..1e30d827ca 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -997,6 +997,83 @@ def expanding(self, *args, **kwargs): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def head(self, n: int = 5): + """ + Return the first n rows of each group. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame([[1, 2], [1, 4], [5, 6]], + ... columns=['A', 'B']) + >>> df.groupby('A').head(1) + A B + 0 1 2 + 2 5 6 + [2 rows x 2 columns] + + Args: + n (int): + If positive: number of entries to include from start of each group. + If negative: number of entries to exclude from end of each group.
+ + Returns: + bigframes.pandas.DataFrame or bigframes.pandas.Series: + First n rows of the original DataFrame or Series. + + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def size(self): + """ + Compute group sizes. + + **Examples:** + + For SeriesGroupBy: + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> lst = ['a', 'a', 'b'] + >>> ser = bpd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: Int64 + >>> ser.groupby(level=0).size() + a 2 + b 1 + dtype: Int64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = bpd.DataFrame(data, columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"]) + >>> df + a b c + owl 1 2 3 + toucan 1 5 6 + eagle 7 8 9 + [3 rows x 3 columns] + >>> df.groupby("a").size() + a + 1 2 + 7 1 + dtype: Int64 + + Returns: + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Number of rows in each group as a Series if as_index is True + or a DataFrame if as_index is False. + + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + class SeriesGroupBy(GroupBy): def agg(self, func): diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index b0e1a09392..763702ef6f 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -105,7 +105,25 @@ def astype(self, dtype): impossible, a TypeError exception is raised. Args: - dtype (numpy dtype or pandas type): + dtype (str or pandas.ExtensionDtype): + Dtypes supported by BigQuery DataFrames include ``'boolean'``, + ``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``, + ``'string'``, ``'string\\[pyarrow\\]'``, + ``'timestamp\\[us, tz=UTC\\]\\[pyarrow\\]'``, + ``'timestamp\\[us\\]\\[pyarrow\\]'``, + ``'date32\\[day\\]\\[pyarrow\\]'``, + ``'time64\\[us\\]\\[pyarrow\\]'``. + Supported pandas.ExtensionDtypes include ``pandas.BooleanDtype()``, + ``pandas.Float64Dtype()``, ``pandas.Int64Dtype()``, + ``pandas.StringDtype(storage="pyarrow")``, + ``pd.ArrowDtype(pa.date32())``, + ``pd.ArrowDtype(pa.time64("us"))``, + ``pd.ArrowDtype(pa.timestamp("us"))``, + ``pd.ArrowDtype(pa.timestamp("us", tz="UTC"))``. + errors ({'raise', 'null'}, default 'raise'): + Control raising of exceptions on invalid data for provided dtype. + If 'raise', allow exceptions to be raised if any value fails to cast. + If 'null', assign a null value if the value fails to cast. Returns: Index: Index with values cast to specified dtype. diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index cdbacaa9cb..20e06673e1 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.26.0" +__version__ = "1.27.0"
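Reviewer note: a minimal usage sketch of the errors='null' option documented in the astype docstrings above. This snippet is not part of the patch; the column name and values are made up for illustration, and it assumes a configured BigQuery DataFrames session so the frame can be materialized.

import bigframes.pandas as bpd

# One value cannot be parsed as a float.
df = bpd.DataFrame({"value": ["1.5", "2.0", "not_a_number"]})

# With errors="raise" (the default) the failed cast surfaces as an error;
# with errors="null" the unparseable value becomes NULL instead.
print(df.astype("Float64", errors="null").to_pandas())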
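Reviewer note: for context on the polars path that TestExecutor in tests/unit/polars_session.py relies on, here is a standalone sketch of the collect-to-Arrow step it performs after compiling an ArrayValue to a LazyFrame. It uses only public polars and pyarrow APIs, no bigframes internals, and the column names are illustrative.

import polars as pl

# Build a small LazyFrame, derive a column, then materialize as a pyarrow.Table,
# mirroring the lazy_frame.collect().to_arrow() call in TestExecutor.execute.
lazy = pl.LazyFrame({"int1": [1, 2, 3], "int2": [-10, 20, 30]})
table = (
    lazy.with_columns((pl.col("int1") + pl.col("int2")).alias("total"))
    .collect()
    .to_arrow()
)

# ExecuteResult exposes the table as record batches plus size metadata.
print(table.num_rows, table.nbytes)
for batch in table.to_batches():
    print(batch.schema)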