From c2009ed1b7a10958c92363d8213397b6e25741ea Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 24 Oct 2023 00:14:53 +0000 Subject: [PATCH 01/32] docs: link to ML.EVALUATE BQML page for score() methods --- bigframes/ml/ensemble.py | 10 ++++++++++ bigframes/ml/forecasting.py | 5 +++++ third_party/bigframes_vendored/sklearn/base.py | 12 +++++++++++- .../bigframes_vendored/sklearn/cluster/_kmeans.py | 8 ++++++-- .../bigframes_vendored/sklearn/decomposition/_pca.py | 7 ++++++- 5 files changed, 38 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 113ad872b5..764f00ed12 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -507,6 +507,11 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluateing model metrics. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. @@ -676,6 +681,11 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluateing model metrics. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 8a6de1dd81..b88518a843 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -112,6 +112,11 @@ def score( ) -> bpd.DataFrame: """Calculate evaluation metrics of the model. + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluateing model metrics. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame only contains 1 column as diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 42868ce51f..4d039be60d 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -85,6 +85,11 @@ def score(self, X, y): which is a harsh metric since you require for each sample that each label set be correctly predicted. + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluateing model metrics. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples, n_features). Test samples. @@ -105,7 +110,12 @@ class RegressorMixin: _estimator_type = "regressor" def score(self, X, y): - """Return the evaluation metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluateing model metrics. 
Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index ece62dc147..7b22bb4560 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -12,7 +12,6 @@ # License: BSD 3 clause from abc import ABC -from typing import List, Optional from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -83,7 +82,12 @@ def score( X, y=None, ): - """Metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluateing model metrics. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 97fee5a501..0326a10c2d 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -55,7 +55,12 @@ def fit(self, X, y=None): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score(self, X=None, y=None): - """Return the metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluateing model metrics. Args: X (default None): From 09ad5e4f2d062b66ffed7dfce3fd25dffb354ca4 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Wed, 25 Oct 2023 23:07:13 +0000 Subject: [PATCH 02/32] feat: label query job with bigframes-api-xx using decorator --- bigframes/core/groupby/__init__.py | 3 + bigframes/core/window/__init__.py | 2 + bigframes/dataframe.py | 2 + bigframes/operations/datetimes.py | 2 + bigframes/operations/strings.py | 2 + bigframes/operations/structs.py | 2 + bigframes/series.py | 2 + bigframes/session/__init__.py | 15 +++- bigframes/session/_io/bigquery.py | 32 ++++++- bigframes/utils/__init__.py | 17 ++++ bigframes/utils/log_adapter.py | 32 +++++++ setup.py | 1 + tests/unit/session/test_io_bigquery.py | 115 ++++++++++++++++++++++++- 13 files changed, 220 insertions(+), 7 deletions(-) create mode 100644 bigframes/utils/__init__.py create mode 100644 bigframes/utils/log_adapter.py diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index db0843fcbc..fb2043bea7 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -30,9 +30,11 @@ import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.series as series +from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby +@log_adapter.class_logger class DataFrameGroupBy(vendored_pandas_groupby.DataFrameGroupBy): __doc__ = vendored_pandas_groupby.GroupBy.__doc__ @@ -408,6 +410,7 @@ def _resolve_label(self, label: blocks.Label) -> str: return col_ids[0] +@log_adapter.class_logger class SeriesGroupBy(vendored_pandas_groupby.SeriesGroupBy): __doc__ = vendored_pandas_groupby.GroupBy.__doc__ diff --git a/bigframes/core/window/__init__.py b/bigframes/core/window/__init__.py index d3d081124e..d84af70bfa 100644 --- a/bigframes/core/window/__init__.py +++ 
b/bigframes/core/window/__init__.py @@ -19,9 +19,11 @@ import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.operations.aggregations as agg_ops +from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.window.rolling as vendored_pandas_rolling +@log_adapter.class_logger class Window(vendored_pandas_rolling.Window): __doc__ = vendored_pandas_rolling.Window.__doc__ diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5c0d9b78e1..04a213d7e8 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -56,6 +56,7 @@ import bigframes.series import bigframes.series as bf_series import bigframes.session._io.bigquery +from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.frame as vendored_pandas_frame import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing @@ -80,6 +81,7 @@ # Inherits from pandas DataFrame so that we can use the same docstrings. +@log_adapter.class_logger class DataFrame(vendored_pandas_frame.DataFrame): __doc__ = vendored_pandas_frame.DataFrame.__doc__ diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index 1b20c2d593..7275b7ee34 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -17,9 +17,11 @@ import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series +from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.indexes.accessor as vendordt +@log_adapter.class_logger class DatetimeMethods( bigframes.operations.base.SeriesMethods, vendordt.DatetimeProperties ): diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 0545ea34d6..6102a63bc5 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -22,6 +22,7 @@ import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series +from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.strings.accessor as vendorstr # Maps from python to re2 @@ -32,6 +33,7 @@ } +@log_adapter.class_logger class StringMethods(bigframes.operations.base.SeriesMethods, vendorstr.StringMethods): __doc__ = vendorstr.StringMethods.__doc__ diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index 506a557709..bdf759371e 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -22,6 +22,7 @@ import bigframes.operations import bigframes.operations.base import bigframes.series +from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors @@ -38,6 +39,7 @@ def _as_ibis(self, x: ibis_types.Value): return struct_value[name].name(name) +@log_adapter.class_logger class StructAccessor( bigframes.operations.base.SeriesMethods, vendoracessors.StructAccessor ): diff --git a/bigframes/series.py b/bigframes/series.py index 49df8ab61e..7493e10c5a 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -52,12 +52,14 @@ import bigframes.operations.datetimes as dt import bigframes.operations.strings as strings import bigframes.operations.structs as structs +from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series LevelType = typing.Union[str, int] LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] +@log_adapter.class_logger class 
Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Series): def __init__(self, *args, **kwargs): self._query_job: Optional[bigquery.QueryJob] = None diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index db9c5a353c..97d11066a6 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -75,6 +75,7 @@ from bigframes.remote_function import remote_function as bigframes_rf import bigframes.session._io.bigquery as bigframes_io import bigframes.session.clients +from bigframes.utils import log_adapter import bigframes.version # Even though the ibis.backends.bigquery.registry import is unused, it's needed @@ -110,6 +111,7 @@ def _is_query(query_or_table: str) -> bool: return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None +@log_adapter.class_logger class Session( third_party_pandas_gbq.GBQIOMixin, third_party_pandas_parquet.ParquetIOMixin, @@ -1496,10 +1498,19 @@ def _start_query( """ Starts query job and waits for results """ + api_methods = log_adapter._api_methods if job_config is not None: - query_job = self.bqclient.query(sql, job_config=job_config) + job_config.labels = bigframes_io.create_job_configs_labels( + job_configs_labels=job_config.labels, api_methods=api_methods + ) else: - query_job = self.bqclient.query(sql) + job_config = bigquery.QueryJobConfig() + job_config.labels = bigframes_io.create_job_configs_labels( + job_configs_labels=None, api_methods=api_methods + ) + query_job = self.bqclient.query(sql, job_config=job_config) + # Clear out the global api logger + log_adapter._api_methods = [] opts = bigframes.options.display if opts.progress_bar is not None and not query_job.configuration.dry_run: diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index d47efbdddc..511ec292a1 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -17,11 +17,41 @@ import datetime import textwrap import types -from typing import Dict, Iterable, Union +from typing import Dict, Iterable, Optional, Sequence, Union import google.cloud.bigquery as bigquery IO_ORDERING_ID = "bqdf_row_nums" +MAX_LABELS_COUNT = 64 + + +def create_job_configs_labels( + job_configs_labels: Optional[Dict[str, str]], + api_methods: Sequence[str], +) -> Dict[str, str]: + # If there is no label set + if job_configs_labels is None: + labels = {} + label_values = list(api_methods) + else: + labels = job_configs_labels.copy() + cur_labels_len = len(job_configs_labels) + api_methods_len = len(api_methods) + # If the total number of labels is under the limit of labels count + if cur_labels_len + api_methods_len <= MAX_LABELS_COUNT: + label_values = list(api_methods) + # We capture the latest label if it is out of the length limit of labels count + else: + added_api_len = cur_labels_len + api_methods_len - MAX_LABELS_COUNT + label_values = list(api_methods)[-added_api_len:] + + for i, label_value in enumerate(label_values): + if job_configs_labels is not None: + label_key = "bigframes-api-" + str(i + len(job_configs_labels)) + else: + label_key = "bigframes-api-" + str(i) + labels[label_key] = label_value + return labels def create_export_csv_statement( diff --git a/bigframes/utils/__init__.py b/bigframes/utils/__init__.py new file mode 100644 index 0000000000..82f1eeda55 --- /dev/null +++ b/bigframes/utils/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance 
with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.utils.log_adapter import class_logger, method_logger + +__all__ = ["class_logger", "method_logger"] diff --git a/bigframes/utils/log_adapter.py b/bigframes/utils/log_adapter.py new file mode 100644 index 0000000000..d498b9a06f --- /dev/null +++ b/bigframes/utils/log_adapter.py @@ -0,0 +1,32 @@ +import functools + +from loguru import logger + +_log_file_path = None +_logger = logger +_api_methods = [] + + +def class_logger(decorated_cls): + """Decorator that adds logging functionality to each method of the class.""" + for attr_name, attr_value in decorated_cls.__dict__.items(): + if callable(attr_value): + setattr(decorated_cls, attr_name, method_logger(attr_value)) + return decorated_cls + + +def method_logger(method): + """Decorator that adds logging functionality to a method.""" + + @functools.wraps(method) + def wrapper(*args, **kwargs): + api_method_name = str(method.__name__) + if not api_method_name.startswith("__"): + add_api_method(api_method_name) + + return wrapper + + +def add_api_method(method: str): + global _api_methods + _api_methods.append(method) diff --git a/setup.py b/setup.py index 29eacb74a9..b57a784a2c 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ "sqlalchemy >=1.4,<3.0dev", "ipywidgets >=7.7.1", "humanize >= 4.6.0", + "logruru >=0.6.0", ] extras = { # Optional test dependencies packages. If they're missed, may skip some tests. 
diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index d2255d5edf..c36e0ed19a 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -18,7 +18,114 @@ import google.cloud.bigquery as bigquery import pytest -import bigframes.session._io.bigquery +import bigframes.pandas as bpd +import bigframes.session._io.bigquery as io_bq +from bigframes.utils import log_adapter + + +def test_create_job_configs_labels_is_none(): + api_methods = ["df-agg", "series-mode"] + labels = io_bq.create_job_configs_labels( + job_configs_labels=None, api_methods=api_methods + ) + expected_dict = {"bigframes-api-0": "df-agg", "bigframes-api-1": "series-mode"} + assert labels is not None + assert labels == expected_dict + + +def test_create_job_configs_labels_length_limit_not_met(): + cur_labels = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + } + api_methods = ["df-agg", "series-mode"] + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) + expected_dict = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + "bigframes-api-2": "df-agg", + "bigframes-api-3": "series-mode", + } + assert labels is not None + assert len(labels) == 4 + assert labels == expected_dict + + +def test_create_job_configs_labels_log_adaptor_under_length_limit(): + log_adapter._api_methods = ["df-agg", "series-mode"] + cur_labels = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + } + api_methods = log_adapter._api_methods + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) + expected_dict = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + "bigframes-api-2": "df-agg", + "bigframes-api-3": "series-mode", + } + assert labels is not None + assert len(labels) == 4 + assert labels == expected_dict + + +def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): + cur_labels = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + } + log_adapter._api_methods = [] + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running two methods + df.head() + df.max() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) + expected_dict = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + "bigframes-api-2": "head", + "bigframes-api-3": "max", + } + assert labels is not None + assert len(labels) == 4 + assert labels == expected_dict + + +def test_create_job_configs_labels_length_limit_met(): + cur_labels = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + } + for i in range(61): + key = f"bigframes-api-{i}" + value = f"test{i}" + cur_labels[key] = value + # If cur_labels length is 63, we can only add one label from api_methods + log_adapter._api_methods = [] + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running two methods + df.head() + df.max() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) + assert labels is not None + assert len(labels) == 64 + assert "head" not in labels.values() + assert "max" in labels.values() + assert "bigframes-api" in labels.keys() + assert "source" in labels.keys() def 
test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): @@ -26,7 +133,7 @@ def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): "my-test-project._e8166e0cdb.anonbb92cd" ) - sql = bigframes.session._io.bigquery.create_snapshot_sql( + sql = io_bq.create_snapshot_sql( table_ref, datetime.datetime.now(datetime.timezone.utc) ) @@ -40,7 +147,7 @@ def test_create_snapshot_sql_doesnt_timetravel_anonymous_datasets(): def test_create_snapshot_sql_doesnt_timetravel_session_datasets(): table_ref = bigquery.TableReference.from_string("my-test-project._session.abcdefg") - sql = bigframes.session._io.bigquery.create_snapshot_sql( + sql = io_bq.create_snapshot_sql( table_ref, datetime.datetime.now(datetime.timezone.utc) ) @@ -101,5 +208,5 @@ def test_create_snapshot_sql_doesnt_timetravel_session_datasets(): ), ) def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str): - sql = bigframes.session._io.bigquery.bq_schema_to_sql(schema) + sql = io_bq.bq_schema_to_sql(schema) assert sql == expected From 4f4eb9b083e23784d20b6506fe83b964c8874af0 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Wed, 25 Oct 2023 23:21:27 +0000 Subject: [PATCH 03/32] reorganize the commit --- bigframes/ml/ensemble.py | 10 ---------- bigframes/ml/forecasting.py | 5 ----- third_party/bigframes_vendored/sklearn/base.py | 12 +----------- .../bigframes_vendored/sklearn/cluster/_kmeans.py | 7 +------ .../bigframes_vendored/sklearn/decomposition/_pca.py | 7 +------ 5 files changed, 3 insertions(+), 38 deletions(-) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 764f00ed12..113ad872b5 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -507,11 +507,6 @@ def score( ): """Calculate evaluation metrics of the model. - .. note:: - - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. - Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. @@ -681,11 +676,6 @@ def score( ): """Calculate evaluation metrics of the model. - .. note:: - - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. - Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index b88518a843..8a6de1dd81 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -112,11 +112,6 @@ def score( ) -> bpd.DataFrame: """Calculate evaluation metrics of the model. - .. note:: - - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. - Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame only contains 1 column as diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 4d039be60d..42868ce51f 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -85,11 +85,6 @@ def score(self, X, y): which is a harsh metric since you require for each sample that each label set be correctly predicted. - .. 
note:: - - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. - Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples, n_features). Test samples. @@ -110,12 +105,7 @@ class RegressorMixin: _estimator_type = "regressor" def score(self, X, y): - """Calculate evaluation metrics of the model. - - .. note:: - - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + """Return the evaluation metrics of the model. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index 7b22bb4560..731346ffc9 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -82,12 +82,7 @@ def score( X, y=None, ): - """Calculate evaluation metrics of the model. - - .. note:: - - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + """Metrics of the model. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 0326a10c2d..97fee5a501 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -55,12 +55,7 @@ def fit(self, X, y=None): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score(self, X=None, y=None): - """Calculate evaluation metrics of the model. - - .. note:: - - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + """Return the metrics of the model. Args: X (default None): From 272f0af4c1331a7de04d3e2944817a750b8afae7 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 26 Oct 2023 17:44:14 +0000 Subject: [PATCH 04/32] test: Log slowest tests durations (#146) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- noxfile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 1864da9fe7..d0bbda80fd 100644 --- a/noxfile.py +++ b/noxfile.py @@ -305,8 +305,10 @@ def run_system( "py.test", "--quiet", "-n=20", - # Any individual test taking longer than 10 mins will be terminated. + # Any individual test taking longer than 15 mins will be terminated. 
"--timeout=900", + # Log 20 slowest tests + "--durations=20", f"--junitxml={prefix_name}_{session.python}_sponge_log.xml", ] if print_duration: From 0e4c49c4122ba040a309758716c9c995e6731d62 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 26 Oct 2023 11:36:14 -0700 Subject: [PATCH 05/32] docs: link to ML.EVALUATE BQML page for score() methods (#137) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/ensemble.py | 12 ++++++++++++ bigframes/ml/forecasting.py | 6 ++++++ third_party/bigframes_vendored/sklearn/base.py | 14 +++++++++++++- .../bigframes_vendored/sklearn/cluster/_kmeans.py | 8 +++++++- .../sklearn/decomposition/_pca.py | 8 +++++++- 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 113ad872b5..19ca8608ff 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -507,6 +507,12 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. @@ -676,6 +682,12 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 8a6de1dd81..8e309d5e73 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -112,6 +112,12 @@ def score( ) -> bpd.DataFrame: """Calculate evaluation metrics of the model. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#time_series_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame only contains 1 column as diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 42868ce51f..768328e552 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -85,6 +85,12 @@ def score(self, X, y): which is a harsh metric since you require for each sample that each label set be correctly predicted. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. 
+ See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples, n_features). Test samples. @@ -105,7 +111,13 @@ class RegressorMixin: _estimator_type = "regressor" def score(self, X, y): - """Return the evaluation metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models + for the outputs relevant to this model type. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index 731346ffc9..5369d3662d 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -82,7 +82,13 @@ def score( X, y=None, ): - """Metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#k-means_models + for the outputs relevant to this model type. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 97fee5a501..011ecc06dd 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -55,7 +55,13 @@ def fit(self, X, y=None): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score(self, X=None, y=None): - """Return the metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#pca_models + for the outputs relevant to this model type. Args: X (default None): From aad2c1a013895afd8b619186d3d4b5a23b565e36 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 26 Oct 2023 12:30:15 -0700 Subject: [PATCH 06/32] feat: populate ibis version in user agent (#140) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/session/clients.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 544f74265f..e33413002f 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -28,12 +28,13 @@ import google.cloud.bigquery_storage_v1 import google.cloud.functions_v2 import google.cloud.resourcemanager_v3 +import ibis import pydata_google_auth import bigframes.version _ENV_DEFAULT_PROJECT = "GOOGLE_CLOUD_PROJECT" -_APPLICATION_NAME = f"bigframes/{bigframes.version.__version__}" +_APPLICATION_NAME = f"bigframes/{bigframes.version.__version__} ibis/{ibis.__version__}" _SCOPES = ["https://www.googleapis.com/auth/cloud-platform"] # BigQuery is a REST API, which requires the protocol as part of the URL. From 1043d6da6908d639ddb9d5a4b1b78041eb757327 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 15:12:14 -0500 Subject: [PATCH 07/32] fix: don't override the global logging config (#138) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/clients.py | 3 --- bigframes/remote_function.py | 5 ----- 2 files changed, 8 deletions(-) diff --git a/bigframes/clients.py b/bigframes/clients.py index 4ba9d93d69..de2421e499 100644 --- a/bigframes/clients.py +++ b/bigframes/clients.py @@ -24,9 +24,6 @@ from google.cloud import bigquery_connection_v1, resourcemanager_v3 from google.iam.v1 import iam_policy_pb2, policy_pb2 -logging.basicConfig( - level=logging.INFO, format="[%(levelname)s][%(asctime)s][%(name)s] %(message)s" -) logger = logging.getLogger(__name__) diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index c82ba84056..a39cd033f6 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -53,11 +53,6 @@ from bigframes import clients import bigframes.constants as constants -# TODO(shobs): Change the min log level to INFO after the development stabilizes -# before June 2023 -logging.basicConfig( - level=logging.INFO, format="[%(levelname)s][%(asctime)s][%(name)s] %(message)s" -) logger = logging.getLogger(__name__) # Protocol version 4 is available in python version 3.4 and above From 1f49ef9a62f38070ca25dca5a31ad471e7cb56c0 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Thu, 26 Oct 2023 14:01:04 -0700 Subject: [PATCH 08/32] fix: use indexee's session for loc listlike cases (#152) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/indexers.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 4f5a9471b9..d18a0a38ef 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -310,7 +310,9 @@ def _loc_getitem_series_or_dataframe( index_name = temporary_index_names[i] values = [entry[i] for entry in key] index_cols_dict[index_name] = values - keys_df = bigframes.dataframe.DataFrame(index_cols_dict) + keys_df = bigframes.dataframe.DataFrame( + index_cols_dict, session=series_or_dataframe._get_block().expr._session + ) keys_df = keys_df.set_index(temporary_index_names, drop=True) keys_df = keys_df.rename_axis(original_index_names) else: @@ -320,7 +322,10 @@ def _loc_getitem_series_or_dataframe( index_name_is_none = index_name is None if index_name_is_none: index_name = "unnamed_col" - keys_df = bigframes.dataframe.DataFrame({index_name: key}) + keys_df = bigframes.dataframe.DataFrame( + {index_name: key}, + session=series_or_dataframe._get_block().expr._session, + ) keys_df = keys_df.set_index(index_name, drop=True) if index_name_is_none: keys_df.index.name = None From c4c1e6e249878365a8530f4a4164399e4be300bd Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 26 Oct 2023 14:40:13 -0700 Subject: [PATCH 09/32] feat: add pandas.qcut (#104) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/reshape/__init__.py | 33 ++++++++++++ bigframes/operations/aggregations.py | 51 +++++++++++++++++++ bigframes/pandas/__init__.py | 13 +++++ tests/system/small/test_pandas.py | 25 +++++++++ .../pandas/core/reshape/tile.py | 30 +++++++++++ 5 files changed, 152 insertions(+) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 339ce7466a..dc61c3baad 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -20,6 +20,7 @@ import bigframes.core as core import bigframes.core.utils as utils import bigframes.dataframe +import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.series @@ -118,3 +119,35 @@ def cut( f"Only labels=False is supported in BigQuery DataFrames so far. 
{constants.FEEDBACK_LINK}" ) return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec()) + + +def qcut( + x: bigframes.series.Series, + q: typing.Union[int, typing.Sequence[float]], + *, + labels: Optional[bool] = None, + duplicates: typing.Literal["drop", "error"] = "error", +) -> bigframes.series.Series: + if isinstance(q, int) and q <= 0: + raise ValueError("`q` should be a positive integer.") + + if labels is not False: + raise NotImplementedError( + f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" + ) + if duplicates != "drop": + raise NotImplementedError( + f"Only duplicates='drop' is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" + ) + block = x._block + label = block.col_id_to_label[x._value_column] + block, nullity_id = block.apply_unary_op(x._value_column, ops.notnull_op) + block, result = block.apply_window_op( + x._value_column, + agg_ops.QcutOp(q), + window_spec=core.WindowSpec(grouping_keys=(nullity_id,)), + ) + block, result = block.apply_binary_op( + result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label + ) + return bigframes.series.Series(block.select_column(result)) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 23271e8220..465d188724 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -254,6 +254,53 @@ def handles_ties(self): return True +class QcutOp(WindowOp): + def __init__(self, quantiles: typing.Union[int, typing.Sequence[float]]): + self.name = f"qcut-{quantiles}" + self._quantiles = quantiles + + @numeric_op + def _as_ibis( + self, column: ibis_types.Column, window=None + ) -> ibis_types.IntegerValue: + if isinstance(self._quantiles, int): + quantiles_ibis = dtypes.literal_to_ibis_scalar(self._quantiles) + percent_ranks = typing.cast( + ibis_types.FloatingColumn, + _apply_window_if_present(column.percent_rank(), window), + ) + float_bucket = typing.cast( + ibis_types.FloatingColumn, (percent_ranks * quantiles_ibis) + ) + return float_bucket.ceil().clip(lower=_ibis_num(1)) - _ibis_num(1) + else: + percent_ranks = typing.cast( + ibis_types.FloatingColumn, + _apply_window_if_present(column.percent_rank(), window), + ) + out = ibis.case() + first_ibis_quantile = dtypes.literal_to_ibis_scalar(self._quantiles[0]) + out = out.when(percent_ranks < first_ibis_quantile, None) + for bucket_n in range(len(self._quantiles) - 1): + ibis_quantile = dtypes.literal_to_ibis_scalar( + self._quantiles[bucket_n + 1] + ) + out = out.when( + percent_ranks <= ibis_quantile, + dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=Int64Dtype()), + ) + out = out.else_(None) + return out.end() + + @property + def skips_nulls(self): + return False + + @property + def handles_ties(self): + return True + + class NuniqueOp(AggregateOp): name = "nunique" @@ -491,3 +538,7 @@ def lookup_agg_func(key: str) -> AggregateOp: return _AGGREGATIONS_LOOKUP[key] else: raise ValueError(f"Unrecognize aggregate function: {key}") + + +def _ibis_num(number: float): + return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 5c1928e6f0..8d9726312f 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -134,6 +134,19 @@ def cut( cut.__doc__ = vendored_pandas_tile.cut.__doc__ +def qcut( + x: bigframes.series.Series, + q: int, + *, + labels: Optional[bool] = None, + duplicates: typing.Literal["drop", "error"] 
= "error", +) -> bigframes.series.Series: + return bigframes.core.reshape.qcut(x, q, labels=labels, duplicates=duplicates) + + +qcut.__doc__ = vendored_pandas_tile.qcut.__doc__ + + def merge( left: DataFrame, right: DataFrame, diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a429c6551d..f8fa78587f 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -223,3 +223,28 @@ def test_cut(scalars_dfs): bf_result = bf_result.to_pandas() pd_result = pd_result.astype("Int64") pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("q",), + [ + (1,), + (2,), + (7,), + (32,), + ([0, 0.1, 0.3, 0.4, 0.9, 1.0],), + ([0.5, 0.9],), + ], +) +def test_qcut(scalars_dfs, q): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.qcut( + scalars_pandas_df["float64_col"], q, labels=False, duplicates="drop" + ) + bf_result = bpd.qcut(scalars_df["float64_col"], q, labels=False, duplicates="drop") + + bf_result = bf_result.to_pandas() + pd_result = pd_result.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 4f5f2efef0..24ea655a5f 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -65,3 +65,33 @@ def cut( False : returns an ndarray of integers. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +def qcut(x, q, *, labels=None, duplicates="error"): + """ + Quantile-based discretization function. + + Discretize variable into equal-sized buckets based on rank or based + on sample quantiles. For example 1000 values for 10 quantiles would + produce a Categorical object indicating quantile membership for each data point. + + Args: + x (Series): + The input Series to be binned. Must be 1-dimensional. + q (int or list-like of float): + Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. + labels (None): + Used as labels for the resulting bins. Must be of the same length as + the resulting bins. If False, return only integer indicators of the + bins. If True, raises an error. + duplicates ({default 'raise', 'drop'}, optional): + If bin edges are not unique, raise ValueError or drop non-uniques. + + Returns: + Series: Categorical or Series of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series + of type category if input is a Series else Categorical. Bins are + represented as categories when categorical data is returned. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 4a27f440eb6219bd0750d356adba12441e6054af Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 26 Oct 2023 15:20:15 -0700 Subject: [PATCH 10/32] feat: add unstack to series, add level param (#115) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/blocks.py | 26 ++++++++++-- bigframes/dataframe.py | 32 +++++---------- bigframes/series.py | 40 +++++++++++-------- tests/system/conftest.py | 8 +++- tests/system/small/test_dataframe.py | 10 ++++- tests/system/small/test_multiindex.py | 31 ++++++++++++-- .../bigframes_vendored/pandas/core/series.py | 13 ++++++ 7 files changed, 112 insertions(+), 48 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index eab4645477..e8a3968b3d 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -67,6 +67,10 @@ _MONOTONIC_DECREASING = "monotonic_decreasing" +LevelType = typing.Union[str, int] +LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] + + class BlockHolder(typing.Protocol): """Interface for mutable objects with state represented by a block value object.""" @@ -1423,9 +1427,7 @@ def _get_unique_values( raise ValueError(f"Too many unique values: {pd_values}") if len(columns) > 1: - return pd.MultiIndex.from_frame( - pd_values.sort_values(by=list(pd_values.columns), na_position="first") - ) + return pd.MultiIndex.from_frame(pd_values) else: return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first")) @@ -1611,6 +1613,24 @@ def cached(self) -> Block: index_labels=self.index_labels, ) + def resolve_index_level(self, level: LevelsType) -> typing.Sequence[str]: + if utils.is_list_like(level): + levels = list(level) + else: + levels = [level] + resolved_level_ids = [] + for level_ref in levels: + if isinstance(level_ref, int): + resolved_level_ids.append(self.index_columns[level_ref]) + elif isinstance(level_ref, typing.Hashable): + matching_ids = self.index_name_to_col_id.get(level_ref, []) + if len(matching_ids) != 1: + raise ValueError("level name cannot be found or is ambiguous") + resolved_level_ids.append(matching_ids[0]) + else: + raise ValueError(f"Unexpected level: {level_ref}") + return resolved_level_ids + def _is_monotonic( self, column_ids: typing.Union[str, Sequence[str]], increasing: bool ) -> bool: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 04a213d7e8..6b79f463ef 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1040,22 +1040,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0): raise ValueError("Columns must be a multiindex to reorder levels.") def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]: - if utils.is_list_like(level): - levels = list(level) - else: - levels = [level] - resolved_level_ids = [] - for level_ref in levels: - if isinstance(level_ref, int): - resolved_level_ids.append(self._block.index_columns[level_ref]) - elif isinstance(level_ref, typing.Hashable): - matching_ids = self._block.index_name_to_col_id.get(level_ref, []) - if len(matching_ids) != 1: - raise ValueError("level name cannot be found or is ambiguous") - resolved_level_ids.append(matching_ids[0]) - else: - raise ValueError(f"Unexpected level: {level_ref}") - return resolved_level_ids + return self._block.resolve_index_level(level) def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame: block = self._block.rename(columns=columns) @@ -1804,20 +1789,25 @@ def _stack_multi(self, level: LevelsType = -1): block = block.stack(levels=len(level)) return DataFrame(block) - 
def unstack(self): + def unstack(self, level: LevelsType = -1): + if isinstance(level, int) or isinstance(level, str): + level = [level] + block = self._block # Special case, unstack with mono-index transpose into a series if self.index.nlevels == 1: block = block.stack(how="right", levels=self.columns.nlevels) return bigframes.series.Series(block) - # Pivot by last level of index - index_ids = block.index_columns + # Pivot by index levels + unstack_ids = self._resolve_levels(level) block = block.reset_index(drop=False) - block = block.set_index(index_ids[:-1]) + block = block.set_index( + [col for col in self._block.index_columns if col not in unstack_ids] + ) pivot_block = block.pivot( - columns=[index_ids[-1]], + columns=unstack_ids, values=self._block.value_columns, values_in_index=True, ) diff --git a/bigframes/series.py b/bigframes/series.py index 7493e10c5a..b65581a3ac 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -354,22 +354,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0): return Series(self._block.reorder_levels(resolved_level_ids)) def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]: - if _is_list_like(level): - levels = list(level) - else: - levels = [level] - resolved_level_ids = [] - for level_ref in levels: - if isinstance(level_ref, int): - resolved_level_ids.append(self._block.index_columns[level_ref]) - elif isinstance(level_ref, typing.Hashable): - matching_ids = self._block.index_name_to_col_id.get(level_ref, []) - if len(matching_ids) != 1: - raise ValueError("level name cannot be found or is ambiguous") - resolved_level_ids.append(matching_ids[0]) - else: - raise ValueError(f"Unexpected level: {level_ref}") - return resolved_level_ids + return self._block.resolve_index_level(level) def between(self, left, right, inclusive="both"): if inclusive not in ["both", "neither", "left", "right"]: @@ -920,6 +905,29 @@ def argmin(self) -> int: scalars.Scalar, Series(block.select_column(row_nums)).iloc[0] ) + def unstack(self, level: LevelsType = -1): + if isinstance(level, int) or isinstance(level, str): + level = [level] + + block = self._block + + if self.index.nlevels == 1: + raise ValueError("Series must have multi-index to unstack") + + # Pivot by index levels + unstack_ids = self._resolve_levels(level) + block = block.reset_index(drop=False) + block = block.set_index( + [col for col in self._block.index_columns if col not in unstack_ids] + ) + + pivot_block = block.pivot( + columns=unstack_ids, + values=self._block.value_columns, + values_in_index=False, + ) + return bigframes.dataframe.DataFrame(pivot_block) + def idxmax(self) -> blocks.Label: block = self._block.order_by( [ diff --git a/tests/system/conftest.py b/tests/system/conftest.py index cb664302a8..8885b03d34 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -400,7 +400,11 @@ def hockey_df( hockey_table_id: str, session: bigframes.Session ) -> bigframes.dataframe.DataFrame: """DataFrame pointing at test data.""" - return session.read_gbq(hockey_table_id) + return ( + session.read_gbq(hockey_table_id) + .set_index(["player_name", "season"]) + .sort_index() + ) @pytest.fixture(scope="session") @@ -419,7 +423,7 @@ def hockey_pandas_df() -> pd.DataFrame: "season": pd.Int64Dtype(), }, ) - df.index = df.index.astype("Int64") + df = df.set_index(["player_name", "season"]).sort_index() return df diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 84e8def83b..a746a1867c 100644 --- 
a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1949,8 +1949,14 @@ def test_df_pivot(scalars_dfs, values, index, columns): ], ) def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): - bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas() - pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns) + bf_result = ( + hockey_df.reset_index() + .pivot(values=values, index=index, columns=columns) + .to_pandas() + ) + pd_result = hockey_pandas_df.reset_index().pivot( + values=values, index=index, columns=columns + ) # Pandas produces NaN, where bq dataframes produces pd.NA pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index b5c78de69c..a87dacae04 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -909,13 +909,36 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_multi_index_unstack(hockey_df, hockey_pandas_df): +@pytest.mark.parametrize( + ("level",), + [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)], +) +def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level): bf_result = ( - hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas() + hockey_df.set_index(["team_name", "position"], append=True) + .unstack(level=level) + .to_pandas() ) pd_result = hockey_pandas_df.set_index( - ["team_name", "season", "position"] - ).unstack() + ["team_name", "position"], append=True + ).unstack(level=level) + + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("level",), + [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)], +) +def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level): + bf_result = ( + hockey_df.set_index(["team_name", "position"], append=True)["number"] + .unstack(level=level) + .to_pandas() + ) + pd_result = hockey_pandas_df.set_index(["team_name", "position"], append=True)[ + "number" + ].unstack(level=level) pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index bd1f9a9a18..f0e13e16f5 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1654,6 +1654,19 @@ def clip(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def unstack(self, level): + """ + Unstack, also known as pivot, Series with MultiIndex to produce DataFrame. + + Args: + level (int, str, or list of these, default last level): + Level(s) to unstack, can pass level name. + + Returns: + DataFrame: Unstacked Series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def argmax(self): """ Return int position of the smallest value in the Series. 
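Usage sketch for the APIs added in patches 09 and 10 (`bigframes.pandas.qcut` and `Series`/`DataFrame.unstack(level=...)`): the snippet below is illustrative only, it assumes a configured BigQuery DataFrames session, and the table reference plus the column names (`team_name`, `position`, `number`) are placeholders borrowed from the hockey test fixture rather than a documented dataset.

    import bigframes.pandas as bpd

    # Placeholder table reference; any table containing the columns used below would do.
    df = bpd.read_gbq("my-project.my_dataset.hockey_players")

    # qcut currently supports only labels=False and duplicates="drop",
    # per the checks added in bigframes/core/reshape/__init__.py (patch 09).
    buckets = bpd.qcut(df["number"], 4, labels=False, duplicates="drop")

    # Series.unstack pivots one level of a multi-index Series into columns;
    # the level can be given by name or position (patch 10).
    wide = df.set_index(["team_name", "position"])["number"].unstack(level="position")
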
From fface576be669e75fd40d522abb8f0ca79589ae4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 18:02:14 -0500 Subject: [PATCH 11/32] feat: add `DataFrame.to_pandas_batches()` to download large `DataFrame` objects (#136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Builds on https://togithub.com/googleapis/python-bigquery-dataframes/pull/132 Towards internal issue 280662868 🦕 --- bigframes/core/blocks.py | 29 +++++++++++-- bigframes/dataframe.py | 4 ++ bigframes/session/_io/pandas.py | 20 +++++++-- tests/system/small/test_dataframe_io.py | 8 ++++ tests/unit/session/test_io_pandas.py | 56 +++++++++++++++++++++++++ 5 files changed, 109 insertions(+), 8 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index e8a3968b3d..9db193a04e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -416,6 +416,30 @@ def to_pandas( ) return df, query_job + def to_pandas_batches(self): + """Download results one message at a time.""" + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + results_iterator, _ = self._expr.start_query() + for arrow_table in results_iterator.to_arrow_iterable( + bqstorage_client=self._expr._session.bqstoragereadclient + ): + df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + self._copy_index_to_pandas(df) + yield df + + def _copy_index_to_pandas(self, df: pd.DataFrame): + """Set the index on pandas DataFrame to match this block. + + Warning: This method modifies ``df`` inplace. + """ + if self.index_columns: + df.set_index(list(self.index_columns), inplace=True) + # Pandas names is annotated as list[str] rather than the more + # general Sequence[Label] that BigQuery DataFrames has. + # See: https://github.com/pandas-dev/pandas-stubs/issues/804 + df.index.names = self.index.names # type: ignore + def _compute_and_count( self, value_keys: Optional[Iterable[str]] = None, @@ -489,10 +513,7 @@ def _compute_and_count( else: total_rows = results_iterator.total_rows df = self._to_dataframe(results_iterator) - - if self.index_columns: - df.set_index(list(self.index_columns), inplace=True) - df.index.names = self.index.names # type: ignore + self._copy_index_to_pandas(df) return df, total_rows, query_job diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6b79f463ef..96c74de1cd 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -895,6 +895,10 @@ def to_pandas( self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) + def to_pandas_batches(self) -> Iterable[pandas.DataFrame]: + """Stream DataFrame results to an iterable of pandas DataFrame""" + return self._block.to_pandas_batches() + def _compute_dry_run(self) -> bigquery.QueryJob: return self._block._compute_dry_run() diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 163127b546..1af00a2d01 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -46,20 +46,32 @@ def arrow_to_pandas( # Preserve NA/NaN distinction. Note: This is currently needed, even if we use # nullable Float64Dtype in the types_mapper. See: # https://github.com/pandas-dev/pandas/issues/55668 + mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, float("nan")) # Regarding type: ignore, this class has been public at this # location since pandas 1.2.0. 
See: # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html pd_array = pandas.arrays.FloatingArray( # type: ignore - column.to_numpy(), - pyarrow.compute.is_null(column).to_numpy(), + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif dtype == pandas.Int64Dtype(): # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly # casts to float64 in an intermediate step. + mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, 0) pd_array = pandas.arrays.IntegerArray( - pyarrow.compute.fill_null(column, 0).to_numpy(), - pyarrow.compute.is_null(column).to_numpy(), + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif isinstance(dtype, pandas.ArrowDtype): diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d60083a837..8f5d706f62 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -83,6 +83,14 @@ def test_to_pandas_array_struct_correct_result(session): ) +def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): + """Verify to_pandas_batches() APIs returns the expected dtypes.""" + expected = scalars_df_default_index.dtypes + for df in scalars_df_default_index.to_pandas_batches(): + actual = df.dtypes + pd.testing.assert_series_equal(actual, expected) + + @pytest.mark.parametrize( ("index"), [True, False], diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index 8b95977ec3..0f6f5dae03 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -231,6 +231,62 @@ ), id="scalar-dtypes", ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": pyarrow.chunked_array( + [[True, None], [True, False]], + type=pyarrow.bool_(), + ), + "bytes": pyarrow.chunked_array( + [[b"123", None], [b"abc", b"xyz"]], + type=pyarrow.binary(), + ), + "float": pyarrow.chunked_array( + [[1.0, None], [float("nan"), -1.0]], + type=pyarrow.float64(), + ), + "int": pyarrow.chunked_array( + [[1, None], [-1, 2**63 - 1]], + type=pyarrow.int64(), + ), + "string": pyarrow.chunked_array( + [["123", None], ["abc", "xyz"]], + type=pyarrow.string(), + ), + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), + id="scalar-dtypes-chunked_array", + ), pytest.param( pyarrow.Table.from_pydict( { From bbc3c693bc291fc91f7c01b8cb837309732ef254 Mon Sep 17 00:00:00 2001 From: Bradford Orr 
<15842009+orrbradford@users.noreply.github.com> Date: Thu, 26 Oct 2023 16:46:14 -0700 Subject: [PATCH 12/32] =?UTF-8?q?fix:=20resolve=20plotly=20rendering=20iss?= =?UTF-8?q?ue=20by=20using=20ipython=20html=20for=20job=20pro=E2=80=A6=20(?= =?UTF-8?q?#134)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …gress messages Fixes bug that was preventing plotly rendering to show after the progress bar. Original ipywidgets implementation isn't necessary for basic opening of urls Screen recording: https://togithub.com/googleapis/python-bigquery-dataframes/assets/15842009/5225ce05-117a-4808-9ff0-cb2c3aaf3a40 Internal bug: b/297062404 --- bigframes/formatting_helpers.py | 24 ++++--- tests/system/small/test_progress_bar.py | 83 +++++++++++-------------- 2 files changed, 53 insertions(+), 54 deletions(-) diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 6851bdd2bd..752aeb7a10 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -16,6 +16,7 @@ # TODO(orrbradford): cleanup up typings and documenttion in this file import datetime +import random from typing import Any, Optional, Union import google.api_core.exceptions as api_core_exceptions @@ -57,9 +58,9 @@ def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): Pywidget html table. """ if query_job is None: - return widgets.HTML("No job information available") + return display.HTML("No job information available") if query_job.dry_run: - return widgets.HTML( + return display.HTML( f"Computation deferred. Computation will process {get_formatted_bytes(query_job.total_bytes_processed)}" ) table_html = "" @@ -125,16 +126,20 @@ def wait_for_query_job( Returns: A row iterator over the query results. """ - loading_bar = widgets.HTML(get_query_job_loading_html(query_job)) if progress_bar == "auto": progress_bar = "notebook" if in_ipython() else "terminal" try: if progress_bar == "notebook": - display.display(loading_bar) + display_id = str(random.random()) + loading_bar = display.HTML(get_query_job_loading_html(query_job)) + display.display(loading_bar, display_id=display_id) query_result = query_job.result(max_results=max_results) query_job.reload() - loading_bar.value = get_query_job_loading_html(query_job) + display.update_display( + display.HTML(get_query_job_loading_html(query_job)), + display_id=display_id, + ) elif progress_bar == "terminal": initial_loading_bar = get_query_job_loading_string(query_job) print(initial_loading_bar) @@ -171,16 +176,19 @@ def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): progress_bar (str, Optional): Which progress bar to show. 
""" - loading_bar = widgets.HTML(get_base_job_loading_html(job)) if progress_bar == "auto": progress_bar = "notebook" if in_ipython() else "terminal" try: if progress_bar == "notebook": - display.display(loading_bar) + display_id = str(random.random()) + loading_bar = display.HTML(get_base_job_loading_html(job)) + display.display(loading_bar, display_id=display_id) job.result() job.reload() - loading_bar.value = get_base_job_loading_html(job) + display.update_display( + display.HTML(get_base_job_loading_html(job)), display_id=display_id + ) elif progress_bar == "terminal": inital_loading_bar = get_base_job_loading_string(job) print(inital_loading_bar) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 00380c2639..f7fc4eaa8f 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re import tempfile import pandas as pd @@ -19,94 +20,84 @@ import bigframes as bf import bigframes.formatting_helpers as formatting_helpers +job_load_message_regex = r"\w+ job [\w-]+ is \w+\." + def test_progress_bar_dataframe( penguins_df_default_index: bf.dataframe.DataFrame, capsys ): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" + capsys.readouterr() # clear output penguins_df_default_index.to_pandas() - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 + + assert_loading_msg_exist(capsys.readouterr().out) assert penguins_df_default_index.query_job is not None - for line in lines: - assert html_check in line and open_job_check in line def test_progress_bar_series(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" series = penguins_df_default_index["body_mass_g"].head(10) + capsys.readouterr() # clear output series.to_pandas() - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 + + assert_loading_msg_exist(capsys.readouterr().out) assert series.query_job is not None - for line in lines: - assert html_check in line and open_job_check in line def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" + capsys.readouterr() # clear output penguins_df_default_index["body_mass_g"].head(10).mean() - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 - for line in lines: - assert html_check in line and open_job_check in line + + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_read_gbq(session: bf.Session, penguins_table_id: str, capsys): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" + capsys.readouterr() # clear output session.read_gbq(penguins_table_id) - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 - for line in lines: - assert html_check in line and 
open_job_check in line + + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_extract_jobs( penguins_df_default_index: bf.dataframe.DataFrame, gcs_folder, capsys ): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" path = gcs_folder + "test_read_csv_progress_bar*.csv" + capsys.readouterr() # clear output penguins_df_default_index.to_csv(path) - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 - for line in lines: - assert html_check in line and open_job_check in line + + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_load_jobs( session: bf.Session, penguins_pandas_df_default_index: pd.DataFrame, capsys ): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" with tempfile.TemporaryDirectory() as dir: path = dir + "/test_read_csv_progress_bar*.csv" penguins_pandas_df_default_index.to_csv(path, index=False) + capsys.readouterr() # clear output session.read_csv(path) - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") + + assert_loading_msg_exist(capsys.readouterr().out) + + +def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): + numLoadingMsg = 0 + lines = capystOut.split("\n") lines = [line for line in lines if len(line) > 0] + assert len(lines) > 0 for line in lines: - assert html_check in line and open_job_check in line + if re.match(pattern, line) is not None: + numLoadingMsg += 1 + assert numLoadingMsg > 0 def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( False ) From a99d62c2e82f6db0786665470531d4b49438aabf Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 26 Oct 2023 17:28:13 -0700 Subject: [PATCH 13/32] refactor: ArrayValue is now a tree that defers conversion to ibis (#110) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/__init__.py | 1221 +++-------------- bigframes/core/block_transforms.py | 25 +- bigframes/core/blocks.py | 34 +- bigframes/core/compile/__init__.py | 21 + bigframes/core/compile/compiled.py | 1121 +++++++++++++++ bigframes/core/compile/compiler.py | 185 +++ .../core/{joins => compile}/row_identity.py | 14 +- .../core/{joins => compile}/single_column.py | 35 +- bigframes/core/groupby/__init__.py | 20 +- bigframes/core/indexers.py | 6 +- bigframes/core/indexes/index.py | 15 +- bigframes/core/joins/__init__.py | 9 +- bigframes/core/nodes.py | 245 ++++ bigframes/core/ordering.py | 4 +- bigframes/core/window_spec.py | 35 + bigframes/dataframe.py | 18 +- bigframes/ml/metrics.py | 2 +- bigframes/operations/base.py | 4 +- bigframes/series.py | 34 +- bigframes/session/__init__.py | 44 +- tests/system/small/test_progress_bar.py | 4 +- tests/system/small/test_series.py | 4 +- tests/system/small/test_session.py | 9 +- tests/unit/core/test_blocks.py | 5 +- tests/unit/resources.py | 17 +- tests/unit/test_core.py | 37 +- 26 files changed, 1996 insertions(+), 1172 deletions(-) create mode 100644 bigframes/core/compile/__init__.py create mode 100644 bigframes/core/compile/compiled.py create mode 100644 bigframes/core/compile/compiler.py rename bigframes/core/{joins => compile}/row_identity.py (94%) rename bigframes/core/{joins => compile}/single_column.py (87%) create mode 100644 bigframes/core/nodes.py create mode 100644 bigframes/core/window_spec.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 6c78a07f3b..4653f0ab6a 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -14,29 +14,21 @@ from __future__ import annotations from dataclasses import dataclass -import functools -import math -import textwrap +import io import typing -from typing import Collection, Iterable, Literal, Optional, Sequence, Tuple +from typing import Iterable, Literal, Optional, Sequence, Tuple from google.cloud import bigquery import ibis -import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas -import bigframes.constants as constants +import bigframes.core.compile as compiled import bigframes.core.guid -from bigframes.core.ordering import ( - encode_order_string, - ExpressionOrdering, - IntegerEncoding, - OrderingColumnReference, - reencode_order_string, - StringEncoding, -) -import bigframes.core.utils as utils +import bigframes.core.nodes as nodes +from bigframes.core.ordering import OrderingColumnReference +import bigframes.core.ordering as orderings +from bigframes.core.window_spec import WindowSpec import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -49,470 +41,190 @@ @dataclass(frozen=True) -class WindowSpec: +class ArrayValue: """ - Specifies a window over which aggregate and analytic function may be applied. - grouping_keys: set of column ids to group on - preceding: Number of preceding rows in the window - following: Number of preceding rows in the window - ordering: List of columns ids and ordering direction to override base ordering + ArrayValue is an immutable type representing a 2D array with per-column types. 
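+
+    Rather than wrapping ibis expressions directly, operations now build a tree
+    of bigframes.core.nodes.BigFrameNode objects; conversion to ibis is deferred
+    until compile() produces a CompiledArrayValue.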
""" - grouping_keys: typing.Sequence[str] = tuple() - ordering: typing.Sequence[OrderingColumnReference] = tuple() - preceding: typing.Optional[int] = None - following: typing.Optional[int] = None - min_periods: int = 0 - - -# TODO(swast): We might want to move this to it's own sub-module. -class ArrayValue: - """Immutable BigQuery DataFrames expression tree. - - Note: Usage of this class is considered to be private and subject to change - at any time. + node: nodes.BigFrameNode - This class is a wrapper around Ibis expressions. Its purpose is to defer - Ibis projection operations to keep generated SQL small and correct when - mixing and matching columns from different versions of a DataFrame. - - Args: - session: - A BigQuery DataFrames session to allow more flexibility in running - queries. - table: An Ibis table expression. - columns: Ibis value expressions that can be projected as columns. - hidden_ordering_columns: Ibis value expressions to store ordering. - ordering: An ordering property of the data frame. - predicates: A list of filters on the data frame. - """ - - def __init__( - self, + @classmethod + def from_ibis( + cls, session: Session, table: ibis_types.Table, columns: Sequence[ibis_types.Value], - hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, - ordering: ExpressionOrdering = ExpressionOrdering(), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + hidden_ordering_columns: Sequence[ibis_types.Value], + ordering: orderings.ExpressionOrdering, ): - self._session = session - self._table = table - self._predicates = tuple(predicates) if predicates is not None else () - # TODO: Validate ordering - if not ordering.total_ordering_columns: - raise ValueError("Must have total ordering defined by one or more columns") - self._ordering = ordering - # Allow creating a DataFrame directly from an Ibis table expression. - # TODO(swast): Validate that each column references the same table (or - # no table for literal values). - self._columns = tuple(columns) - - # Meta columns store ordering, or other data that doesn't correspond to dataframe columns - self._hidden_ordering_columns = ( - tuple(hidden_ordering_columns) - if hidden_ordering_columns is not None - else () - ) - - # To allow for more efficient lookup by column name, create a - # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} - self._hidden_ordering_column_names = { - column.get_name(): column for column in self._hidden_ordering_columns - } - ### Validation - value_col_ids = self._column_names.keys() - hidden_col_ids = self._hidden_ordering_column_names.keys() - - all_columns = value_col_ids | hidden_col_ids - ordering_valid = all( - col.column_id in all_columns for col in ordering.all_ordering_columns + node = nodes.ReadGbqNode( + table=table, + table_session=session, + columns=tuple(columns), + hidden_ordering_columns=tuple(hidden_ordering_columns), + ordering=ordering, ) - if value_col_ids & hidden_col_ids: - raise ValueError( - f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" - ) - if not ordering_valid: - raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") + return cls(node) @classmethod - def mem_expr_from_pandas( - cls, - pd_df: pandas.DataFrame, - session: Optional[Session], - ) -> ArrayValue: - """ - Builds an in-memory only (SQL only) expr from a pandas dataframe. 
+ def from_pandas(cls, pd_df: pandas.DataFrame): + iobytes = io.BytesIO() + # Discard row labels and use simple string ids for columns + column_ids = tuple(str(label) for label in pd_df.columns) + pd_df.reset_index(drop=True).set_axis(column_ids, axis=1).to_feather(iobytes) + node = nodes.ReadLocalNode(iobytes.getvalue(), column_ids=column_ids) + return cls(node) - Caution: If session is None, only a subset of expr functionality will - be available (null Session is usually not supported). - """ - # We can't include any hidden columns in the ArrayValue constructor, so - # grab the column names before we add the hidden ordering column. - column_names = [str(column) for column in pd_df.columns] - # Make sure column names are all strings. - pd_df = pd_df.set_axis(column_names, axis="columns") - pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) - - # ibis memtable cannot handle NA, must convert to None - pd_df = pd_df.astype("object") # type: ignore - pd_df = pd_df.where(pandas.notnull(pd_df), None) + @property + def column_ids(self) -> typing.Sequence[str]: + return self.compile().column_ids - # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases. - keys_memtable = ibis.memtable(pd_df) - schema = keys_memtable.schema() - new_schema = [] - for column_index, column in enumerate(schema): - if column == ORDER_ID_COLUMN: - new_type: ibis_dtypes.DataType = ibis_dtypes.int64 - else: - column_type = schema[column] - # The autodetected type might not be one we can support, such - # as NULL type for empty rows, so convert to a type we do - # support. - new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) - ) - # TODO(swast): Ibis memtable doesn't use backticks in struct - # field names, so spaces and other characters aren't allowed in - # the memtable context. Blocked by - # https://github.com/ibis-project/ibis/issues/7187 - column = f"col_{column_index}" - new_schema.append((column, new_type)) + @property + def session(self) -> Session: + required_session = self.node.session + from bigframes import get_global_session - # must set non-null column labels. 
these are not the user-facing labels - pd_df = pd_df.set_axis( - [column for column, _ in new_schema], - axis="columns", - ) - keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema)) + return self.node.session[0] if required_session else get_global_session() - return cls( - session, # type: ignore # Session cannot normally be none, see "caution" above - keys_memtable, - columns=[ - keys_memtable[f"col_{column_index}"].name(column) - for column_index, column in enumerate(column_names) - ], - ordering=ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - ), - hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), - ) - - @property - def columns(self) -> typing.Tuple[ibis_types.Value, ...]: - return self._columns + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + return self.compile().get_column_type(key) - @property - def column_ids(self) -> typing.Sequence[str]: - return tuple(self._column_names.keys()) + def compile(self) -> compiled.CompiledArrayValue: + return compiled.compile_node(self.node) - @property - def _hidden_column_ids(self) -> typing.Sequence[str]: - return tuple(self._hidden_ordering_column_names.keys()) + def shape(self) -> typing.Tuple[int, int]: + """Returns dimensions as (length, width) tuple.""" + width = len(self.compile().columns) + count_expr = self.compile()._to_ibis_expr("unordered").count() - @property - def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: - """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" - return ( - _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) - if self._predicates - else None + # Support in-memory engines for hermetic unit tests. + if not self.node.session: + try: + length = ibis.pandas.connect({}).execute(count_expr) + return (length, width) + except Exception: + # Not all cases can be handled by pandas engine + pass + + sql = self.session.ibis_client.compile(count_expr) + row_iterator, _ = self.session._start_query( + sql=sql, + max_results=1, ) + length = next(row_iterator)[0] + return (length, width) - @property - def _ibis_order(self) -> Sequence[ibis_types.Value]: - """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" - return _convert_ordering_to_table_values( - {**self._column_names, **self._hidden_ordering_column_names}, - self._ordering.all_ordering_columns, + def to_sql( + self, + offset_column: typing.Optional[str] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, + ) -> str: + return self.compile().to_sql( + offset_column=offset_column, + col_id_overrides=col_id_overrides, + sorted=sorted, ) - def builder(self) -> ArrayValueBuilder: - """Creates a mutable builder for expressions.""" - # Since ArrayValue is intended to be immutable (immutability offers - # potential opportunities for caching, though we might need to introduce - # more node types for that to be useful), we create a builder class. 
- return ArrayValueBuilder( - self._session, - self._table, - columns=self._columns, - hidden_ordering_columns=self._hidden_ordering_columns, - ordering=self._ordering, - predicates=self._predicates, + def start_query( + self, + job_config: Optional[bigquery.job.QueryJobConfig] = None, + max_results: Optional[int] = None, + *, + sorted: bool = True, + ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + """Execute a query and return metadata about the results.""" + # TODO(swast): Cache the job ID so we can look it up again if they ask + # for the results? We'd need a way to invalidate the cache if DataFrame + # becomes mutable, though. Or move this method to the immutable + # expression class. + # TODO(swast): We might want to move this method to Session and/or + # provide our own minimal metadata class. Tight coupling to the + # BigQuery client library isn't ideal, especially if we want to support + # a LocalSession for unit testing. + # TODO(swast): Add a timeout here? If the query is taking a long time, + # maybe we just print the job metadata that we have so far? + sql = self.to_sql(sorted=sorted) # type:ignore + return self.session._start_query( + sql=sql, + job_config=job_config, + max_results=max_results, ) - def drop_columns(self, columns: Iterable[str]) -> ArrayValue: - # Must generate offsets if we are dropping a column that ordering depends on - expr = self - for ordering_column in set(columns).intersection( - [col.column_id for col in self._ordering.ordering_value_columns] - ): - expr = self._hide_column(ordering_column) - - expr_builder = expr.builder() - remain_cols = [ - column for column in expr.columns if column.get_name() not in columns - ] - expr_builder.columns = remain_cols - return expr_builder.build() - - def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - ibis_type = typing.cast( - bigframes.dtypes.IbisDtype, self._get_any_column(key).type() + def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: + """Write the ArrayValue to a session table and create a new block object that references it.""" + compiled = self.compile() + ibis_expr = compiled._to_ibis_expr("unordered", expose_hidden_cols=True) + destination = self.session._ibis_to_session_table( + ibis_expr, cluster_cols=cluster_cols, api_name="cache" ) - return typing.cast( - bigframes.dtypes.Dtype, - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), + table_expression = self.session.ibis_client.table( + f"{destination.project}.{destination.dataset_id}.{destination.table_id}" + ) + new_columns = [table_expression[column] for column in compiled.column_ids] + new_hidden_columns = [ + table_expression[column] + for column in compiled._hidden_ordering_column_names + ] + return ArrayValue.from_ibis( + self.session, + table_expression, + columns=new_columns, + hidden_ordering_columns=new_hidden_columns, + ordering=compiled._ordering, ) - def _get_ibis_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column.""" - if key not in self.column_ids: - raise ValueError( - "Column name {} not in set of values: {}".format(key, self.column_ids) - ) - return typing.cast(ibis_types.Value, self._column_names[key]) - - def _get_any_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column. 
Will also get hidden columns.""" - all_columns = {**self._column_names, **self._hidden_ordering_column_names} - if key not in all_columns.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, all_columns.keys() - ) - ) - return typing.cast(ibis_types.Value, all_columns[key]) + # Operations - def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: - """Gets the Ibis expression for a given hidden column.""" - if key not in self._hidden_ordering_column_names.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, self._hidden_ordering_column_names.keys() - ) - ) - return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) + def drop_columns(self, columns: Iterable[str]) -> ArrayValue: + return ArrayValue( + nodes.DropColumnsNode(child=self.node, columns=tuple(columns)) + ) def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - condition = typing.cast( - ibis_types.BooleanValue, self._get_ibis_column(predicate_id) - ) - if keep_null: - condition = typing.cast( - ibis_types.BooleanValue, - condition.fillna( - typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) - ), + return ArrayValue( + nodes.FilterNode( + child=self.node, predicate_id=predicate_id, keep_null=keep_null ) - return self._filter(condition) - - def _filter(self, predicate_value: ibis_types.BooleanValue) -> ArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - expr = self.builder() - expr.ordering = expr.ordering.with_non_sequential() - expr.predicates = [*self._predicates, predicate_value] - return expr.build() + ) def order_by( self, by: Sequence[OrderingColumnReference], stable: bool = False ) -> ArrayValue: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) - return expr_builder.build() - - def reversed(self) -> ArrayValue: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_reverse() - return expr_builder.build() - - def _uniform_sampling(self, fraction: float) -> ArrayValue: - """Sampling the table on given fraction. - - .. warning:: - The row numbers of result is non-deterministic, avoid to use. - """ - table = self._to_ibis_expr( - "unordered", expose_hidden_cols=True, fraction=fraction - ) - columns = [table[column_name] for column_name in self._column_names] - hidden_ordering_columns = [ - table[column_name] for column_name in self._hidden_ordering_column_names - ] return ArrayValue( - self._session, - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, + nodes.OrderByNode(child=self.node, by=tuple(by), stable=stable) ) - @property - def _offsets(self) -> ibis_types.IntegerColumn: - if not self._ordering.is_sequential: - raise ValueError( - "Expression does not have offsets. Generate them first using project_offsets." - ) - if not self._ordering.total_order_col: - raise ValueError( - "Ordering is invalid. Marked as sequential but no total order columns." - ) - column = self._get_any_column(self._ordering.total_order_col.column_id) - return typing.cast(ibis_types.IntegerColumn, column) - - def _project_offsets(self) -> ArrayValue: - """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. 
Has no effect on expression semantics.""" - if self._ordering.is_sequential: - return self - # TODO(tbergeron): Enforce total ordering - table = self._to_ibis_expr( - ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN - ) - columns = [table[column_name] for column_name in self._column_names] - ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(True, is_sequential=True), - ) - return ArrayValue( - self._session, - table, - columns=columns, - hidden_ordering_columns=[table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def _hide_column(self, column_id) -> ArrayValue: - """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" - expr_builder = self.builder() - # Need to rename column as caller might be creating a new row with the same name but different values. - # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. - new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") - expr_builder.hidden_ordering_columns = [ - *self._hidden_ordering_columns, - self._get_ibis_column(column_id).name(new_name), - ] - expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) - return expr_builder.build() + def reversed(self) -> ArrayValue: + return ArrayValue(nodes.ReversedNode(child=self.node)) def promote_offsets(self, col_id: str) -> ArrayValue: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. """ - # Special case: offsets already exist - ordering = self._ordering - - if (not ordering.is_sequential) or (not ordering.total_order_col): - return self._project_offsets().promote_offsets(col_id) - expr_builder = self.builder() - expr_builder.columns = [ - self._get_any_column(ordering.total_order_col.column_id).name(col_id), - *self.columns, - ] - return expr_builder.build() + return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)) def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: - return self._projection( - [self._get_ibis_column(col_id) for col_id in column_ids] + return ArrayValue( + nodes.SelectNode(child=self.node, column_ids=tuple(column_ids)) ) - def _projection(self, columns: Iterable[ibis_types.Value]) -> ArrayValue: - """Creates a new expression based on this expression with new columns.""" - # TODO(swast): We might want to do validation here that columns derive - # from the same table expression instead of (in addition to?) at - # construction time. - - expr = self - for ordering_column in set(self.column_ids).intersection( - [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] - ): - # Need to hide ordering columns that are being dropped. Alternatively, could project offsets - expr = expr._hide_column(ordering_column) - builder = expr.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr - - def shape(self) -> typing.Tuple[int, int]: - """Returns dimensions as (length, width) tuple.""" - width = len(self.columns) - count_expr = self._to_ibis_expr("unordered").count() - sql = self._session.ibis_client.compile(count_expr) - - # Support in-memory engines for hermetic unit tests. 
- if not isinstance(sql, str): - length = self._session.ibis_client.execute(count_expr) - else: - row_iterator, _ = self._session._start_query( - sql=sql, - max_results=1, - ) - length = next(row_iterator)[0] - return (length, width) - def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: """Append together multiple ArrayValue objects.""" - if len(other) == 0: - return self - tables = [] - prefix_base = 10 - prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) - # Must normalize all ids to the same encoding size - max_encoding_size = max( - self._ordering.string_encoding.length, - *[expression._ordering.string_encoding.length for expression in other], - ) - for i, expr in enumerate([self, *other]): - ordering_prefix = str(i).zfill(prefix_size) - table = expr._to_ibis_expr( - ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN - ) - # Rename the value columns based on horizontal offset before applying union. - table = table.select( - [ - table[col].name(f"column_{i}") - if col != ORDER_ID_COLUMN - else ( - ordering_prefix - + reencode_order_string( - table[ORDER_ID_COLUMN], max_encoding_size - ) - ).name(ORDER_ID_COLUMN) - for i, col in enumerate(table.columns) - ] - ) - tables.append(table) - combined_table = ibis.union(*tables) - ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - string_encoding=StringEncoding(True, prefix_size + max_encoding_size), - ) return ArrayValue( - self._session, - combined_table, - columns=[ - combined_table[col] - for col in combined_table.columns - if col != ORDER_ID_COLUMN - ], - hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], - ordering=ordering, + nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]])) ) def project_unary_op( self, column_name: str, op: ops.UnaryOp, output_name=None ) -> ArrayValue: """Creates a new expression based on this expression with unary operation applied to one column.""" - value = op._as_ibis(self._get_ibis_column(column_name)).name( - output_name or column_name + return ArrayValue( + nodes.ProjectUnaryOpNode( + child=self.node, input_id=column_name, op=op, output_id=output_name + ) ) - return self._set_or_replace_by_id(output_name or column_name, value) def project_binary_op( self, @@ -522,11 +234,15 @@ def project_binary_op( output_column_id: str, ) -> ArrayValue: """Creates a new expression based on this expression with binary operation applied to two columns.""" - value = op( - self._get_ibis_column(left_column_id), - self._get_ibis_column(right_column_id), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) + return ArrayValue( + nodes.ProjectBinaryOpNode( + child=self.node, + left_input_id=left_column_id, + right_input_id=right_column_id, + op=op, + output_id=output_column_id, + ) + ) def project_ternary_op( self, @@ -537,12 +253,16 @@ def project_ternary_op( output_column_id: str, ) -> ArrayValue: """Creates a new expression based on this expression with ternary operation applied to three columns.""" - value = op( - self._get_ibis_column(col_id_1), - self._get_ibis_column(col_id_2), - self._get_ibis_column(col_id_3), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) + return ArrayValue( + nodes.ProjectTernaryOpNode( + child=self.node, + input_id1=col_id_1, + input_id2=col_id_2, + input_id3=col_id_3, + op=op, + output_id=output_column_id, + ) + ) def aggregate( self, @@ -557,46 
+277,14 @@ def aggregate( by_column_id: column id of the aggregation key, this is preserved through the transform dropna: whether null keys should be dropped """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: agg_op._as_ibis(table[col_in]) - for col_in, agg_op, col_out in aggregations - } - if by_column_ids: - result = table.group_by(by_column_ids).aggregate(**stats) - # Must have deterministic ordering, so order by the unique "by" column - ordering = ExpressionOrdering( - [ - OrderingColumnReference(column_id=column_id) - for column_id in by_column_ids - ], - total_ordering_columns=frozenset(by_column_ids), - ) - columns = tuple(result[key] for key in result.columns) - expr = ArrayValue(self._session, result, columns=columns, ordering=ordering) - if dropna: - for column_id in by_column_ids: - expr = expr._filter( - ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) - ) - # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation - return expr._project_offsets() - else: - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. - ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) - return ArrayValue( - self._session, - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, + return ArrayValue( + nodes.AggregateNode( + child=self.node, + aggregations=tuple(aggregations), + by_column_ids=tuple(by_column_ids), + dropna=dropna, ) + ) def corr_aggregate( self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] @@ -607,25 +295,8 @@ def corr_aggregate( Arguments: corr_aggregations: left_column_id, right_column_id, output_column_id tuples """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: table[col_left].corr(table[col_right], how="pop") - for col_left, col_right, col_out in corr_aggregations - } - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. 
- ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) return ArrayValue( - self._session, - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, + nodes.CorrNode(child=self.node, corr_aggregations=tuple(corr_aggregations)) ) def project_window_op( @@ -647,231 +318,17 @@ def project_window_op( never_skip_nulls: will disable null skipping for operators that would otherwise do so skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ - column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) - window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) - - window_op = op._as_ibis(column, window) - - clauses = [] - if op.skips_nulls and not never_skip_nulls: - clauses.append((column.isnull(), ibis.NA)) - if window_spec.min_periods: - if op.skips_nulls: - # Most operations do not count NULL values towards min_periods - observation_count = agg_ops.count_op._as_ibis(column, window) - else: - # Operations like count treat even NULLs as valid observations for the sake of min_periods - # notnull is just used to convert null values to non-null (FALSE) values to be counted - denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) - observation_count = agg_ops.count_op._as_ibis(denulled_value, window) - clauses.append( - ( - observation_count < ibis_types.literal(window_spec.min_periods), - ibis.NA, - ) - ) - if clauses: - case_statement = ibis.case() - for clause in clauses: - case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() - window_op = case_statement - - result = self._set_or_replace_by_id(output_name or column_name, window_op) - # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. - return result._reproject_to_table() if not skip_reproject_unsafe else result - - def to_sql( - self, - offset_column: typing.Optional[str] = None, - col_id_overrides: typing.Mapping[str, str] = {}, - sorted: bool = False, - ) -> str: - offsets_id = offset_column or ORDER_ID_COLUMN - - sql = self._session.ibis_client.compile( - self._to_ibis_expr( - ordering_mode="offset_col" - if (offset_column or sorted) - else "unordered", - order_col_name=offsets_id, - col_id_overrides=col_id_overrides, - ) - ) - if sorted: - sql = textwrap.dedent( - f""" - SELECT * EXCEPT (`{offsets_id}`) - FROM ({sql}) - ORDER BY `{offsets_id}` - """ - ) - return typing.cast(str, sql) - - def _to_ibis_expr( - self, - ordering_mode: Literal["string_encoded", "offset_col", "unordered"], - order_col_name: Optional[str] = ORDER_ID_COLUMN, - expose_hidden_cols: bool = False, - fraction: Optional[float] = None, - col_id_overrides: typing.Mapping[str, str] = {}, - ): - """ - Creates an Ibis table expression representing the DataFrame. - - ArrayValue objects are sorted, so the following options are available - to reflect this in the ibis expression. - - * "offset_col": Zero-based offsets are generated as a column, this will - not sort the rows however. 
- * "string_encoded": An ordered string column is provided in output table. - * "unordered": No ordering information will be provided in output. Only - value columns are projected. - - For offset or ordered column, order_col_name can be used to assign the - output label for the ordering column. If none is specified, the default - column name will be 'bigframes_ordering_id' - - Args: - ordering_mode: - How to construct the Ibis expression from the ArrayValue. See - above for details. - order_col_name: - If the ordering mode outputs a single ordering or offsets - column, use this as the column name. - expose_hidden_cols: - If True, include the hidden ordering columns in the results. - Only compatible with `order_by` and `unordered` - ``ordering_mode``. - col_id_overrides: - overrides the column ids for the result - Returns: - An ibis expression representing the data help by the ArrayValue object. - """ - assert ordering_mode in ( - "string_encoded", - "offset_col", - "unordered", - ) - if expose_hidden_cols and ordering_mode in ("ordered_col", "offset_col"): - raise ValueError( - f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" + return ArrayValue( + nodes.WindowOpNode( + child=self.node, + column_name=column_name, + op=op, + window_spec=window_spec, + output_name=output_name, + never_skip_nulls=never_skip_nulls, + skip_reproject_unsafe=skip_reproject_unsafe, ) - - columns = list(self._columns) - columns_to_drop: list[ - str - ] = [] # Ordering/Filtering columns that will be dropped at end - - if self._reduced_predicate is not None: - columns.append(self._reduced_predicate) - # Usually drop predicate as it is will be all TRUE after filtering - if not expose_hidden_cols: - columns_to_drop.append(self._reduced_predicate.get_name()) - - order_columns = self._create_order_columns( - ordering_mode, order_col_name, expose_hidden_cols ) - columns.extend(order_columns) - - # Special case for empty tables, since we can't create an empty - # projection. - if not columns: - return ibis.memtable([]) - - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. 
- table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns - ) - base_table = table - if self._reduced_predicate is not None: - table = table.filter(base_table[PREDICATE_COLUMN]) - table = table.drop(*columns_to_drop) - if col_id_overrides: - table = table.relabel(col_id_overrides) - if fraction is not None: - table = table.filter(ibis.random() < ibis.literal(fraction)) - return table - - def _create_order_columns( - self, - ordering_mode: str, - order_col_name: Optional[str], - expose_hidden_cols: bool, - ) -> typing.Sequence[ibis_types.Value]: - # Generate offsets if current ordering id semantics are not sufficiently strict - if ordering_mode == "offset_col": - return (self._create_offset_column().name(order_col_name),) - elif ordering_mode == "string_encoded": - return (self._create_string_ordering_column().name(order_col_name),) - elif expose_hidden_cols: - return self._hidden_ordering_columns - return () - - def _create_offset_column(self) -> ibis_types.IntegerColumn: - if self._ordering.total_order_col and self._ordering.is_sequential: - offsets = self._get_any_column(self._ordering.total_order_col.column_id) - return typing.cast(ibis_types.IntegerColumn, offsets) - else: - window = ibis.window(order_by=self._ibis_order) - if self._predicates: - window = window.group_by(self._reduced_predicate) - offsets = ibis.row_number().over(window) - return typing.cast(ibis_types.IntegerColumn, offsets) - - def _create_string_ordering_column(self) -> ibis_types.StringColumn: - if self._ordering.total_order_col and self._ordering.is_string_encoded: - string_order_ids = self._get_any_column( - self._ordering.total_order_col.column_id - ) - return typing.cast(ibis_types.StringColumn, string_order_ids) - if ( - self._ordering.total_order_col - and self._ordering.integer_encoding.is_encoded - ): - # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers - int_values = self._get_any_column(self._ordering.total_order_col.column_id) - return encode_order_string( - typing.cast(ibis_types.IntegerColumn, int_values), - ) - else: - # Have to build string from scratch - window = ibis.window(order_by=self._ibis_order) - if self._predicates: - window = window.group_by(self._reduced_predicate) - row_nums = typing.cast( - ibis_types.IntegerColumn, ibis.row_number().over(window) - ) - return encode_order_string(row_nums) - - def start_query( - self, - job_config: Optional[bigquery.job.QueryJobConfig] = None, - max_results: Optional[int] = None, - *, - sorted: bool = True, - ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - """Execute a query and return metadata about the results.""" - # TODO(swast): Cache the job ID so we can look it up again if they ask - # for the results? We'd need a way to invalidate the cache if DataFrame - # becomes mutable, though. Or move this method to the immutable - # expression class. - # TODO(swast): We might want to move this method to Session and/or - # provide our own minimal metadata class. Tight coupling to the - # BigQuery client library isn't ideal, especially if we want to support - # a LocalSession for unit testing. - # TODO(swast): Add a timeout here? If the query is taking a long time, - # maybe we just print the job metadata that we have so far? 
- sql = self.to_sql(sorted=True) # type:ignore - return self._session._start_query( - sql=sql, - job_config=job_config, - max_results=max_results, - ) - - def _get_table_size(self, destination_table): - return self._session._get_table_size(destination_table) def _reproject_to_table(self) -> ArrayValue: """ @@ -881,74 +338,25 @@ def _reproject_to_table(self) -> ArrayValue: some operations such as window operations that cannot be used recursively in projections. """ - table = self._to_ibis_expr( - "unordered", - expose_hidden_cols=True, - ) - columns = [table[column_name] for column_name in self._column_names] - ordering_col_ids = [ - ref.column_id for ref in self._ordering.all_ordering_columns - ] - hidden_ordering_columns = [ - table[column_name] - for column_name in self._hidden_ordering_column_names - if column_name in ordering_col_ids - ] return ArrayValue( - self._session, - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, - ) - - def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): - group_by: typing.List[ibis_types.Value] = ( - [ - typing.cast( - ibis_types.Column, _as_identity(self._get_ibis_column(column)) - ) - for column in window_spec.grouping_keys - ] - if window_spec.grouping_keys - else [] - ) - if self._reduced_predicate is not None: - group_by.append(self._reduced_predicate) - if window_spec.ordering: - order_by = _convert_ordering_to_table_values( - {**self._column_names, **self._hidden_ordering_column_names}, - window_spec.ordering, + nodes.ReprojectOpNode( + child=self.node, ) - if not allow_ties: - # Most operator need an unambiguous ordering, so the table's total ordering is appended - order_by = tuple([*order_by, *self._ibis_order]) - elif (window_spec.following is not None) or (window_spec.preceding is not None): - # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. - order_by = tuple(self._ibis_order) - else: - # Unbound grouping window. Suitable for aggregations but not for analytic function application. - order_by = None - return ibis.window( - preceding=window_spec.preceding, - following=window_spec.following, - order_by=order_by, - group_by=group_by, ) def unpivot( self, row_labels: typing.Sequence[typing.Hashable], unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]] ], *, passthrough_columns: typing.Sequence[str] = (), index_col_ids: typing.Sequence[str] = ["index"], dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] ] = pandas.Float64Dtype(), - how="left", + how: typing.Literal["left", "right"] = "left", ) -> ArrayValue: """ Unpivot ArrayValue columns. 
@@ -963,133 +371,23 @@ def unpivot( Returns: ArrayValue: The unpivoted ArrayValue """ - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr("unordered", expose_hidden_cols=True) - row_n = len(row_labels) - hidden_col_ids = self._hidden_ordering_column_names.keys() - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) - ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - *hidden_col_ids, - unpivot_offset_id, - ) - - # Extend the original ordering using unpivot_offset_id - old_ordering = self._ordering - if how == "left": - new_ordering = ExpressionOrdering( - ordering_value_columns=[ - *old_ordering.ordering_value_columns, - OrderingColumnReference(unpivot_offset_id), - ], - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - else: # how=="right" - new_ordering = ExpressionOrdering( - ordering_value_columns=[ - OrderingColumnReference(unpivot_offset_id), - *old_ordering.ordering_value_columns, - ], - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - hidden_ordering_columns = [ - 
unpivot_table[unpivot_offset_id], - *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], - ] return ArrayValue( - session=self._session, - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - hidden_ordering_columns=hidden_ordering_columns, - ordering=new_ordering, + nodes.UnpivotNode( + child=self.node, + row_labels=tuple(row_labels), + unpivot_columns=tuple(unpivot_columns), + passthrough_columns=tuple(passthrough_columns), + index_col_ids=tuple(index_col_ids), + dtype=dtype, + how=how, + ) ) def assign(self, source_id: str, destination_id: str) -> ArrayValue: - return self._set_or_replace_by_id( - destination_id, self._get_ibis_column(source_id) + return ArrayValue( + nodes.AssignNode( + child=self.node, source_id=source_id, destination_id=destination_id + ) ) def assign_constant( @@ -1098,128 +396,41 @@ def assign_constant( value: typing.Any, dtype: typing.Optional[bigframes.dtypes.Dtype], ) -> ArrayValue: - # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. - ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) - if ibis_value is None: - raise NotImplementedError( - f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" - ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() - - def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> ArrayValue: - """Safely assign by id while maintaining ordering integrity.""" - # TODO: Split into explicit set and replace methods - ordering_col_ids = [ - col_ref.column_id for col_ref in self._ordering.ordering_value_columns - ] - if id in ordering_col_ids: - return self._hide_column(id)._set_or_replace_by_id(id, new_value) - - builder = self.builder() - if id in self.column_ids: - builder.columns = [ - val if (col_id != id) else new_value.name(id) - for col_id, val in zip(self.column_ids, self._columns) - ] - else: - builder.columns = [*self.columns, new_value.name(id)] - return builder.build() - - def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: - """Write the ArrayValue to a session table and create a new block object that references it.""" - ibis_expr = self._to_ibis_expr("unordered", expose_hidden_cols=True) - destination = self._session._ibis_to_session_table( - ibis_expr, cluster_cols=cluster_cols, api_name="cache" - ) - table_expression = self._session.ibis_client.table( - f"{destination.project}.{destination.dataset_id}.{destination.table_id}" - ) - new_columns = [table_expression[column] for column in self.column_ids] - new_hidden_columns = [ - table_expression[column] for column in self._hidden_ordering_column_names - ] return ArrayValue( - self._session, - table_expression, - columns=new_columns, - hidden_ordering_columns=new_hidden_columns, - ordering=self._ordering, + nodes.AssignConstantNode( + child=self.node, destination_id=destination_id, value=value, dtype=dtype + ) ) - -class ArrayValueBuilder: - """Mutable expression class. - Use ArrayValue.builder() to create from a ArrayValue object. 
- """ - - def __init__( + def join( self, - session: Session, - table: ibis_types.Table, - ordering: ExpressionOrdering, - columns: Collection[ibis_types.Value] = (), - hidden_ordering_columns: Collection[ibis_types.Value] = (), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + self_column_ids: typing.Sequence[str], + other: ArrayValue, + other_column_ids: typing.Sequence[str], + *, + how: Literal[ + "inner", + "left", + "outer", + "right", + ], + allow_row_identity_join: bool = True, ): - self.session = session - self.table = table - self.columns = list(columns) - self.hidden_ordering_columns = list(hidden_ordering_columns) - self.ordering = ordering - self.predicates = list(predicates) if predicates is not None else None - - def build(self) -> ArrayValue: return ArrayValue( - session=self.session, - table=self.table, - columns=self.columns, - hidden_ordering_columns=self.hidden_ordering_columns, - ordering=self.ordering, - predicates=self.predicates, - ) - - -def _reduce_predicate_list( - predicate_list: typing.Collection[ibis_types.BooleanValue], -) -> ibis_types.BooleanValue: - """Converts a list of predicates BooleanValues into a single BooleanValue.""" - if len(predicate_list) == 0: - raise ValueError("Cannot reduce empty list of predicates") - if len(predicate_list) == 1: - (item,) = predicate_list - return item - return functools.reduce(lambda acc, pred: acc.__and__(pred), predicate_list) - - -def _convert_ordering_to_table_values( - value_lookup: typing.Mapping[str, ibis_types.Value], - ordering_columns: typing.Sequence[OrderingColumnReference], -) -> typing.Sequence[ibis_types.Value]: - column_refs = ordering_columns - ordering_values = [] - for ordering_col in column_refs: - column = typing.cast(ibis_types.Column, value_lookup[ordering_col.column_id]) - ordering_value = ( - ibis.asc(column) - if ordering_col.direction.is_ascending - else ibis.desc(column) + nodes.JoinNode( + left_child=self.node, + right_child=other.node, + left_column_ids=tuple(self_column_ids), + right_column_ids=tuple(other_column_ids), + how=how, + allow_row_identity_join=allow_row_identity_join, + ) ) - # Bigquery SQL considers NULLS to be "smallest" values, but we need to override in these cases. - if (not ordering_col.na_last) and (not ordering_col.direction.is_ascending): - # Force nulls to be first - is_null_val = typing.cast(ibis_types.Column, column.isnull()) - ordering_values.append(ibis.desc(is_null_val)) - elif (ordering_col.na_last) and (ordering_col.direction.is_ascending): - # Force nulls to be last - is_null_val = typing.cast(ibis_types.Column, column.isnull()) - ordering_values.append(ibis.asc(is_null_val)) - ordering_values.append(ordering_value) - return ordering_values + def _uniform_sampling(self, fraction: float) -> ArrayValue: + """Sampling the table on given fraction. -def _as_identity(value: ibis_types.Value): - # Some types need to be converted to string to enable groupby - if value.type().is_float64() or value.type().is_geospatial(): - return value.cast(ibis_dtypes.str) - return value + .. warning:: + The row numbers of result is non-deterministic, avoid to use. 
+ """ + return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index b0f05f4798..3706bf1681 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -21,6 +21,7 @@ import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.ordering as ordering +import bigframes.core.window_spec as windows import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -68,21 +69,21 @@ def indicate_duplicates( if keep == "first": # Count how many copies occur up to current copy of value # Discard this value if there are copies BEFORE - window_spec = core.WindowSpec( + window_spec = windows.WindowSpec( grouping_keys=tuple(columns), following=0, ) elif keep == "last": # Count how many copies occur up to current copy of values # Discard this value if there are copies AFTER - window_spec = core.WindowSpec( + window_spec = windows.WindowSpec( grouping_keys=tuple(columns), preceding=0, ) else: # keep == False # Count how many copies of the value occur in entire series. # Discard this value if there are copies ANYWHERE - window_spec = core.WindowSpec(grouping_keys=tuple(columns)) + window_spec = windows.WindowSpec(grouping_keys=tuple(columns)) block, dummy = block.create_constant(1) block, val_count_col_id = block.apply_window_op( dummy, @@ -131,7 +132,7 @@ def value_counts( ) count_id = agg_ids[0] if normalize: - unbound_window = core.WindowSpec() + unbound_window = windows.WindowSpec() block, total_count_id = block.apply_window_op( count_id, agg_ops.sum_op, unbound_window ) @@ -153,7 +154,7 @@ def value_counts( def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block: column_labels = block.column_labels - window_spec = core.WindowSpec( + window_spec = windows.WindowSpec( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -195,7 +196,7 @@ def rank( ops.isnull_op, ) nullity_col_ids.append(nullity_col_id) - window = core.WindowSpec( + window = windows.WindowSpec( # BigQuery has syntax to reorder nulls with "NULLS FIRST/LAST", but that is unavailable through ibis presently, so must order on a separate nullity expression first. ordering=( ordering.OrderingColumnReference( @@ -229,7 +230,7 @@ def rank( block, result_id = block.apply_window_op( rownum_col_ids[i], agg_op, - window_spec=core.WindowSpec(grouping_keys=[columns[i]]), + window_spec=windows.WindowSpec(grouping_keys=(columns[i],)), skip_reproject_unsafe=(i < (len(columns) - 1)), ) post_agg_rownum_col_ids.append(result_id) @@ -311,7 +312,7 @@ def nsmallest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=core.WindowSpec(ordering=order_refs), + window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) block, condition = block.apply_unary_op( counter, ops.partial_right(ops.le_op, n) @@ -343,7 +344,7 @@ def nlargest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=core.WindowSpec(ordering=order_refs), + window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) block, condition = block.apply_unary_op( counter, ops.partial_right(ops.le_op, n) @@ -440,14 +441,14 @@ def _mean_delta_to_power( grouping_column_ids: typing.Sequence[str], ) -> typing.Tuple[blocks.Block, typing.Sequence[str]]: """Calculate (x-mean(x))^n. 
Useful for calculating moment statistics such as skew and kurtosis.""" - window = core.WindowSpec(grouping_keys=grouping_column_ids) + window = windows.WindowSpec(grouping_keys=tuple(grouping_column_ids)) block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window) delta_ids = [] cube_op = ops.partial_right(ops.pow_op, n_power) for val_id, mean_val_id in zip(column_ids, mean_ids): block, delta_id = block.apply_binary_op(val_id, mean_val_id, ops.sub_op) block, delta_power_id = block.apply_unary_op(delta_id, cube_op) - block = block.drop_columns(delta_id) + block = block.drop_columns([delta_id]) delta_ids.append(delta_power_id) return block, delta_ids @@ -645,7 +646,7 @@ def _idx_extrema( for idx_col in original_block.index_columns ], ] - window_spec = core.WindowSpec(ordering=order_refs) + window_spec = windows.WindowSpec(ordering=tuple(order_refs)) idx_col = original_block.index_columns[0] block, result_col = block.apply_window_op( idx_col, agg_ops.first_op, window_spec diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 9db193a04e..cc13edeaf9 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -35,7 +35,6 @@ import bigframes.core as core import bigframes.core.guid as guid import bigframes.core.indexes as indexes -import bigframes.core.joins as joins import bigframes.core.joins.name_resolution as join_names import bigframes.core.ordering as ordering import bigframes.core.utils @@ -378,7 +377,7 @@ def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" dtypes = dict(zip(self.index_columns, self.index_dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) - return self._expr._session._rows_to_dataframe(result, dtypes) + return self._expr.session._rows_to_dataframe(result, dtypes) def to_pandas( self, @@ -422,7 +421,7 @@ def to_pandas_batches(self): dtypes.update(zip(self.value_columns, self.dtypes)) results_iterator, _ = self._expr.start_query() for arrow_table in results_iterator.to_arrow_iterable( - bqstorage_client=self._expr._session.bqstoragereadclient + bqstorage_client=self._expr.session.bqstoragereadclient ): df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) self._copy_index_to_pandas(df) @@ -454,7 +453,9 @@ def _compute_and_count( results_iterator, query_job = expr.start_query(max_results=max_results) - table_size = expr._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES + table_size = ( + expr.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES + ) fraction = ( max_download_size / table_size if (max_download_size is not None) and (table_size != 0) @@ -819,7 +820,9 @@ def aggregate_all_and_stack( axis: int | str = 0, value_col_id: str = "values", dropna: bool = True, - dtype=pd.Float64Dtype(), + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
+ ] = pd.Float64Dtype(), ) -> Block: axis_n = utils.get_axis_number(axis) if axis_n == 0: @@ -829,7 +832,7 @@ def aggregate_all_and_stack( result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( row_labels=self.column_labels.to_list(), index_col_ids=["index"], - unpivot_columns=[(value_col_id, self.value_columns)], + unpivot_columns=tuple([(value_col_id, tuple(self.value_columns))]), dtype=dtype, ) return Block(result_expr, index_columns=["index"], column_labels=[None]) @@ -841,7 +844,7 @@ def aggregate_all_and_stack( stacked_expr = expr_with_offsets.unpivot( row_labels=self.column_labels.to_list(), index_col_ids=[guid.generate_guid()], - unpivot_columns=[(value_col_id, self.value_columns)], + unpivot_columns=[(value_col_id, tuple(self.value_columns))], passthrough_columns=[*self.index_columns, offset_col], dtype=dtype, ) @@ -1029,13 +1032,13 @@ def summarize( for col_id in column_ids ] columns = [ - (col_id, [f"{col_id}-{stat.name}" for stat in stats]) + (col_id, tuple(f"{col_id}-{stat.name}" for stat in stats)) for col_id in column_ids ] expr = self.expr.aggregate(aggregations).unpivot( labels, - unpivot_columns=columns, - index_col_ids=[label_col_id], + unpivot_columns=tuple(columns), + index_col_ids=tuple([label_col_id]), ) labels = self._get_labels_for_columns(column_ids) return Block(expr, column_labels=labels, index_columns=[label_col_id]) @@ -1342,7 +1345,7 @@ def stack(self, how="left", levels: int = 1): passthrough_columns=self.index_columns, unpivot_columns=unpivot_columns, index_col_ids=added_index_columns, - dtype=dtypes, + dtype=tuple(dtypes), how=how, ) new_index_level_names = self.column_labels.names[-levels:] @@ -1382,7 +1385,7 @@ def _create_stack_column( dtype = self._column_type(input_id) input_columns.append(input_id) # Input column i is the first one that - return input_columns, dtype or pd.Float64Dtype() + return tuple(input_columns), dtype or pd.Float64Dtype() def _column_type(self, col_id: str) -> bigframes.dtypes.Dtype: col_offset = self.value_columns.index(col_id) @@ -1497,8 +1500,7 @@ def merge( sort: bool, suffixes: tuple[str, str] = ("_x", "_y"), ) -> Block: - joined_expr = joins.join_by_column( - self.expr, + joined_expr = self.expr.join( left_join_ids, other.expr, right_join_ids, @@ -1708,7 +1710,7 @@ def _is_monotonic( return result -def block_from_local(data, session=None) -> Block: +def block_from_local(data) -> Block: pd_data = pd.DataFrame(data) columns = pd_data.columns @@ -1730,7 +1732,7 @@ def block_from_local(data, session=None) -> Block: ) index_ids = pd_data.columns[: len(index_labels)] - keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session) + keys_expr = core.ArrayValue.from_pandas(pd_data) return Block( keys_expr, column_labels=columns, diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py new file mode 100644 index 0000000000..c86f4463dc --- /dev/null +++ b/bigframes/core/compile/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
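The new bigframes/core/compile package that begins here is the seam between the logical tree and SQL generation: ArrayValue methods now only build BigFrameNode objects, and this package lowers a node tree into a CompiledArrayValue, the Ibis-backed class that can emit SQL. A minimal sketch of the intended flow, using only names that appear in this patch (the helper function itself is hypothetical, not part of the change):

    import bigframes.core.compile as compiling

    def array_value_to_sql(array_value) -> str:
        # `array_value.node` is the root BigFrameNode; compile_node lowers the
        # whole tree (with per-node caching) into a CompiledArrayValue.
        compiled = compiling.compile_node(array_value.node)
        # CompiledArrayValue keeps the Ibis expression plus ordering metadata
        # and can render ordered SQL.
        return compiled.to_sql(sorted=True)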
+ +from bigframes.core.compile.compiled import CompiledArrayValue +from bigframes.core.compile.compiler import compile_node + +__all__ = [ + "compile_node", + "CompiledArrayValue", +] diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py new file mode 100644 index 0000000000..1134f1aab0 --- /dev/null +++ b/bigframes/core/compile/compiled.py @@ -0,0 +1,1121 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import functools +import math +import textwrap +import typing +from typing import Collection, Iterable, Literal, Optional, Sequence + +import ibis +import ibis.backends.bigquery as ibis_bigquery +import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.types as ibis_types +import pandas + +import bigframes.constants as constants +import bigframes.core.guid +from bigframes.core.ordering import ( + encode_order_string, + ExpressionOrdering, + IntegerEncoding, + OrderingColumnReference, + reencode_order_string, + StringEncoding, +) +import bigframes.core.utils as utils +from bigframes.core.window_spec import WindowSpec +import bigframes.dtypes +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +ORDER_ID_COLUMN = "bigframes_ordering_id" +PREDICATE_COLUMN = "bigframes_predicate" + + +class CompiledArrayValue: + """Immutable BigQuery DataFrames expression tree. + + Note: Usage of this class is considered to be private and subject to change + at any time. + + This class is a wrapper around Ibis expressions. Its purpose is to defer + Ibis projection operations to keep generated SQL small and correct when + mixing and matching columns from different versions of a DataFrame. + + Args: + table: An Ibis table expression. + columns: Ibis value expressions that can be projected as columns. + hidden_ordering_columns: Ibis value expressions to store ordering. + ordering: An ordering property of the data frame. + predicates: A list of filters on the data frame. + """ + + def __init__( + self, + table: ibis_types.Table, + columns: Sequence[ibis_types.Value], + hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, + ordering: ExpressionOrdering = ExpressionOrdering(), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self._table = table + self._predicates = tuple(predicates) if predicates is not None else () + # TODO: Validate ordering + if not ordering.total_ordering_columns: + raise ValueError("Must have total ordering defined by one or more columns") + self._ordering = ordering + # Allow creating a DataFrame directly from an Ibis table expression. + # TODO(swast): Validate that each column references the same table (or + # no table for literal values). 
+ self._columns = tuple(columns) + + # Meta columns store ordering, or other data that doesn't correspond to dataframe columns + self._hidden_ordering_columns = ( + tuple(hidden_ordering_columns) + if hidden_ordering_columns is not None + else () + ) + + # To allow for more efficient lookup by column name, create a + # dictionary mapping names to column values. + self._column_names = {column.get_name(): column for column in self._columns} + self._hidden_ordering_column_names = { + column.get_name(): column for column in self._hidden_ordering_columns + } + ### Validation + value_col_ids = self._column_names.keys() + hidden_col_ids = self._hidden_ordering_column_names.keys() + + all_columns = value_col_ids | hidden_col_ids + ordering_valid = all( + col.column_id in all_columns for col in ordering.all_ordering_columns + ) + if value_col_ids & hidden_col_ids: + raise ValueError( + f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" + ) + if not ordering_valid: + raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") + + @classmethod + def mem_expr_from_pandas( + cls, + pd_df: pandas.DataFrame, + ) -> CompiledArrayValue: + """ + Builds an in-memory only (SQL only) expr from a pandas dataframe. + """ + # We can't include any hidden columns in the ArrayValue constructor, so + # grab the column names before we add the hidden ordering column. + column_names = [str(column) for column in pd_df.columns] + # Make sure column names are all strings. + pd_df = pd_df.set_axis(column_names, axis="columns") + pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) + + # ibis memtable cannot handle NA, must convert to None + pd_df = pd_df.astype("object") # type: ignore + pd_df = pd_df.where(pandas.notnull(pd_df), None) + + # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases. + keys_memtable = ibis.memtable(pd_df) + schema = keys_memtable.schema() + new_schema = [] + for column_index, column in enumerate(schema): + if column == ORDER_ID_COLUMN: + new_type: ibis_dtypes.DataType = ibis_dtypes.int64 + else: + column_type = schema[column] + # The autodetected type might not be one we can support, such + # as NULL type for empty rows, so convert to a type we do + # support. + new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) + ) + # TODO(swast): Ibis memtable doesn't use backticks in struct + # field names, so spaces and other characters aren't allowed in + # the memtable context. Blocked by + # https://github.com/ibis-project/ibis/issues/7187 + column = f"col_{column_index}" + new_schema.append((column, new_type)) + + # must set non-null column labels. 
these are not the user-facing labels + pd_df = pd_df.set_axis( + [column for column, _ in new_schema], + axis="columns", + ) + keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema)) + + return cls( + keys_memtable, + columns=[ + keys_memtable[f"col_{column_index}"].name(column) + for column_index, column in enumerate(column_names) + ], + ordering=ExpressionOrdering( + ordering_value_columns=tuple( + [OrderingColumnReference(ORDER_ID_COLUMN)] + ), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + ), + hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), + ) + + @property + def columns(self) -> typing.Tuple[ibis_types.Value, ...]: + return self._columns + + @property + def column_ids(self) -> typing.Sequence[str]: + return tuple(self._column_names.keys()) + + @property + def _hidden_column_ids(self) -> typing.Sequence[str]: + return tuple(self._hidden_ordering_column_names.keys()) + + @property + def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: + """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" + return ( + _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) + if self._predicates + else None + ) + + @property + def _ibis_order(self) -> Sequence[ibis_types.Value]: + """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" + return _convert_ordering_to_table_values( + {**self._column_names, **self._hidden_ordering_column_names}, + self._ordering.all_ordering_columns, + ) + + def builder(self) -> ArrayValueBuilder: + """Creates a mutable builder for expressions.""" + # Since ArrayValue is intended to be immutable (immutability offers + # potential opportunities for caching, though we might need to introduce + # more node types for that to be useful), we create a builder class. + return ArrayValueBuilder( + self._table, + columns=self._columns, + hidden_ordering_columns=self._hidden_ordering_columns, + ordering=self._ordering, + predicates=self._predicates, + ) + + def drop_columns(self, columns: Iterable[str]) -> CompiledArrayValue: + # Must generate offsets if we are dropping a column that ordering depends on + expr = self + for ordering_column in set(columns).intersection( + [col.column_id for col in self._ordering.ordering_value_columns] + ): + expr = self._hide_column(ordering_column) + + expr_builder = expr.builder() + remain_cols = [ + column for column in expr.columns if column.get_name() not in columns + ] + expr_builder.columns = remain_cols + return expr_builder.build() + + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + ibis_type = typing.cast( + bigframes.dtypes.IbisDtype, self._get_any_column(key).type() + ) + return typing.cast( + bigframes.dtypes.Dtype, + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), + ) + + def _get_ibis_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column.""" + if key not in self.column_ids: + raise ValueError( + "Column name {} not in set of values: {}".format(key, self.column_ids) + ) + return typing.cast(ibis_types.Value, self._column_names[key]) + + def _get_any_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column. 
Will also get hidden columns.""" + all_columns = {**self._column_names, **self._hidden_ordering_column_names} + if key not in all_columns.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, all_columns.keys() + ) + ) + return typing.cast(ibis_types.Value, all_columns[key]) + + def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: + """Gets the Ibis expression for a given hidden column.""" + if key not in self._hidden_ordering_column_names.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, self._hidden_ordering_column_names.keys() + ) + ) + return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) + + def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + condition = typing.cast( + ibis_types.BooleanValue, self._get_ibis_column(predicate_id) + ) + if keep_null: + condition = typing.cast( + ibis_types.BooleanValue, + condition.fillna( + typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) + ), + ) + return self._filter(condition) + + def _filter(self, predicate_value: ibis_types.BooleanValue) -> CompiledArrayValue: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + expr = self.builder() + expr.ordering = expr.ordering.with_non_sequential() + expr.predicates = [*self._predicates, predicate_value] + return expr.build() + + def order_by( + self, by: Sequence[OrderingColumnReference], stable: bool = False + ) -> CompiledArrayValue: + expr_builder = self.builder() + expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) + return expr_builder.build() + + def reversed(self) -> CompiledArrayValue: + expr_builder = self.builder() + expr_builder.ordering = self._ordering.with_reverse() + return expr_builder.build() + + def _uniform_sampling(self, fraction: float) -> CompiledArrayValue: + """Sampling the table on given fraction. + + .. warning:: + The row numbers of result is non-deterministic, avoid to use. + """ + table = self._to_ibis_expr( + "unordered", expose_hidden_cols=True, fraction=fraction + ) + columns = [table[column_name] for column_name in self._column_names] + hidden_ordering_columns = [ + table[column_name] for column_name in self._hidden_ordering_column_names + ] + return CompiledArrayValue( + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, + ) + + @property + def _offsets(self) -> ibis_types.IntegerColumn: + if not self._ordering.is_sequential: + raise ValueError( + "Expression does not have offsets. Generate them first using project_offsets." + ) + if not self._ordering.total_order_col: + raise ValueError( + "Ordering is invalid. Marked as sequential but no total order columns." + ) + column = self._get_any_column(self._ordering.total_order_col.column_id) + return typing.cast(ibis_types.IntegerColumn, column) + + def _project_offsets(self) -> CompiledArrayValue: + """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. 
Has no effect on expression semantics.""" + if self._ordering.is_sequential: + return self + # TODO(tbergeron): Enforce total ordering + table = self._to_ibis_expr( + ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN + ) + columns = [table[column_name] for column_name in self._column_names] + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(True, is_sequential=True), + ) + return CompiledArrayValue( + table, + columns=columns, + hidden_ordering_columns=[table[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def _hide_column(self, column_id) -> CompiledArrayValue: + """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" + expr_builder = self.builder() + # Need to rename column as caller might be creating a new row with the same name but different values. + # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. + new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") + expr_builder.hidden_ordering_columns = [ + *self._hidden_ordering_columns, + self._get_ibis_column(column_id).name(new_name), + ] + expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) + return expr_builder.build() + + def promote_offsets(self, col_id: str) -> CompiledArrayValue: + """ + Convenience function to promote copy of column offsets to a value column. Can be used to reset index. + """ + # Special case: offsets already exist + ordering = self._ordering + + if (not ordering.is_sequential) or (not ordering.total_order_col): + return self._project_offsets().promote_offsets(col_id) + expr_builder = self.builder() + expr_builder.columns = [ + self._get_any_column(ordering.total_order_col.column_id).name(col_id), + *self.columns, + ] + return expr_builder.build() + + def select_columns(self, column_ids: typing.Sequence[str]) -> CompiledArrayValue: + """Creates a new expression based on this expression with new columns.""" + columns = [self._get_ibis_column(col_id) for col_id in column_ids] + expr = self + for ordering_column in set(self.column_ids).intersection( + [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] + ): + # Need to hide ordering columns that are being dropped. Alternatively, could project offsets + expr = expr._hide_column(ordering_column) + builder = expr.builder() + builder.columns = list(columns) + new_expr = builder.build() + return new_expr + + def concat(self, other: typing.Sequence[CompiledArrayValue]) -> CompiledArrayValue: + """Append together multiple ArrayValue objects.""" + if len(other) == 0: + return self + tables = [] + prefix_base = 10 + prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) + # Must normalize all ids to the same encoding size + max_encoding_size = max( + self._ordering.string_encoding.length, + *[expression._ordering.string_encoding.length for expression in other], + ) + for i, expr in enumerate([self, *other]): + ordering_prefix = str(i).zfill(prefix_size) + table = expr._to_ibis_expr( + ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN + ) + # Rename the value columns based on horizontal offset before applying union. 
+ table = table.select( + [ + table[col].name(f"column_{i}") + if col != ORDER_ID_COLUMN + else ( + ordering_prefix + + reencode_order_string( + table[ORDER_ID_COLUMN], max_encoding_size + ) + ).name(ORDER_ID_COLUMN) + for i, col in enumerate(table.columns) + ] + ) + tables.append(table) + combined_table = ibis.union(*tables) + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + string_encoding=StringEncoding(True, prefix_size + max_encoding_size), + ) + return CompiledArrayValue( + combined_table, + columns=[ + combined_table[col] + for col in combined_table.columns + if col != ORDER_ID_COLUMN + ], + hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def project_unary_op( + self, column_name: str, op: ops.UnaryOp, output_name=None + ) -> CompiledArrayValue: + """Creates a new expression based on this expression with unary operation applied to one column.""" + value = op._as_ibis(self._get_ibis_column(column_name)).name( + output_name or column_name + ) + return self._set_or_replace_by_id(output_name or column_name, value) + + def project_binary_op( + self, + left_column_id: str, + right_column_id: str, + op: ops.BinaryOp, + output_column_id: str, + ) -> CompiledArrayValue: + """Creates a new expression based on this expression with binary operation applied to two columns.""" + value = op( + self._get_ibis_column(left_column_id), + self._get_ibis_column(right_column_id), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def project_ternary_op( + self, + col_id_1: str, + col_id_2: str, + col_id_3: str, + op: ops.TernaryOp, + output_column_id: str, + ) -> CompiledArrayValue: + """Creates a new expression based on this expression with ternary operation applied to three columns.""" + value = op( + self._get_ibis_column(col_id_1), + self._get_ibis_column(col_id_2), + self._get_ibis_column(col_id_3), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def aggregate( + self, + aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], + by_column_ids: typing.Sequence[str] = (), + dropna: bool = True, + ) -> CompiledArrayValue: + """ + Apply aggregations to the expression. 
+ Arguments: + aggregations: input_column_id, operation, output_column_id tuples + by_column_id: column id of the aggregation key, this is preserved through the transform + dropna: whether null keys should be dropped + """ + table = self._to_ibis_expr("unordered") + stats = { + col_out: agg_op._as_ibis(table[col_in]) + for col_in, agg_op, col_out in aggregations + } + if by_column_ids: + result = table.group_by(by_column_ids).aggregate(**stats) + # Must have deterministic ordering, so order by the unique "by" column + ordering = ExpressionOrdering( + tuple( + [ + OrderingColumnReference(column_id=column_id) + for column_id in by_column_ids + ] + ), + total_ordering_columns=frozenset(by_column_ids), + ) + columns = tuple(result[key] for key in result.columns) + expr = CompiledArrayValue(result, columns=columns, ordering=ordering) + if dropna: + for column_id in by_column_ids: + expr = expr._filter( + ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) + ) + # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation + return expr._project_offsets() + else: + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. + ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [OrderingColumnReference(ORDER_ID_COLUMN)] + ), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return CompiledArrayValue( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def corr_aggregate( + self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] + ) -> CompiledArrayValue: + """ + Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id. + This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. + Arguments: + corr_aggregations: left_column_id, right_column_id, output_column_id tuples + """ + table = self._to_ibis_expr("unordered") + stats = { + col_out: table[col_left].corr(table[col_right], how="pop") + for col_left, col_right, col_out in corr_aggregations + } + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return CompiledArrayValue( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def project_window_op( + self, + column_name: str, + op: agg_ops.WindowOp, + window_spec: WindowSpec, + output_name=None, + *, + never_skip_nulls=False, + skip_reproject_unsafe: bool = False, + ) -> CompiledArrayValue: + """ + Creates a new expression based on this expression with unary operation applied to one column. 
+ column_name: the id of the input column present in the expression + op: the windowable operator to apply to the input column + window_spec: a specification of the window over which to apply the operator + output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided + never_skip_nulls: will disable null skipping for operators that would otherwise do so + skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection + """ + column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) + window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) + + window_op = op._as_ibis(column, window) + + clauses = [] + if op.skips_nulls and not never_skip_nulls: + clauses.append((column.isnull(), ibis.NA)) + if window_spec.min_periods: + if op.skips_nulls: + # Most operations do not count NULL values towards min_periods + observation_count = agg_ops.count_op._as_ibis(column, window) + else: + # Operations like count treat even NULLs as valid observations for the sake of min_periods + # notnull is just used to convert null values to non-null (FALSE) values to be counted + denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) + observation_count = agg_ops.count_op._as_ibis(denulled_value, window) + clauses.append( + ( + observation_count < ibis_types.literal(window_spec.min_periods), + ibis.NA, + ) + ) + if clauses: + case_statement = ibis.case() + for clause in clauses: + case_statement = case_statement.when(clause[0], clause[1]) + case_statement = case_statement.else_(window_op).end() + window_op = case_statement + + result = self._set_or_replace_by_id(output_name or column_name, window_op) + # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. + return result._reproject_to_table() if not skip_reproject_unsafe else result + + def to_sql( + self, + offset_column: typing.Optional[str] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, + ) -> str: + offsets_id = offset_column or ORDER_ID_COLUMN + + sql = ibis_bigquery.Backend().compile( + self._to_ibis_expr( + ordering_mode="offset_col" + if (offset_column or sorted) + else "unordered", + order_col_name=offsets_id, + col_id_overrides=col_id_overrides, + ) + ) + if sorted: + sql = textwrap.dedent( + f""" + SELECT * EXCEPT (`{offsets_id}`) + FROM ({sql}) + ORDER BY `{offsets_id}` + """ + ) + return typing.cast(str, sql) + + def _to_ibis_expr( + self, + ordering_mode: Literal["string_encoded", "offset_col", "unordered"], + order_col_name: Optional[str] = ORDER_ID_COLUMN, + expose_hidden_cols: bool = False, + fraction: Optional[float] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + ): + """ + Creates an Ibis table expression representing the DataFrame. + + ArrayValue objects are sorted, so the following options are available + to reflect this in the ibis expression. + + * "offset_col": Zero-based offsets are generated as a column, this will + not sort the rows however. + * "string_encoded": An ordered string column is provided in output table. + * "unordered": No ordering information will be provided in output. Only + value columns are projected. 
+ + For offset or ordered column, order_col_name can be used to assign the + output label for the ordering column. If none is specified, the default + column name will be 'bigframes_ordering_id' + + Args: + ordering_mode: + How to construct the Ibis expression from the ArrayValue. See + above for details. + order_col_name: + If the ordering mode outputs a single ordering or offsets + column, use this as the column name. + expose_hidden_cols: + If True, include the hidden ordering columns in the results. + Only compatible with `order_by` and `unordered` + ``ordering_mode``. + col_id_overrides: + overrides the column ids for the result + Returns: + An ibis expression representing the data help by the ArrayValue object. + """ + assert ordering_mode in ( + "string_encoded", + "offset_col", + "unordered", + ) + if expose_hidden_cols and ordering_mode in ("ordered_col", "offset_col"): + raise ValueError( + f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" + ) + + columns = list(self._columns) + columns_to_drop: list[ + str + ] = [] # Ordering/Filtering columns that will be dropped at end + + if self._reduced_predicate is not None: + columns.append(self._reduced_predicate) + # Usually drop predicate as it is will be all TRUE after filtering + if not expose_hidden_cols: + columns_to_drop.append(self._reduced_predicate.get_name()) + + order_columns = self._create_order_columns( + ordering_mode, order_col_name, expose_hidden_cols + ) + columns.extend(order_columns) + + # Special case for empty tables, since we can't create an empty + # projection. + if not columns: + return ibis.memtable([]) + + # Make sure all dtypes are the "canonical" ones for BigFrames. This is + # important for operations like UNION where the schema must match. 
+ table = self._table.select( + bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + ) + base_table = table + if self._reduced_predicate is not None: + table = table.filter(base_table[PREDICATE_COLUMN]) + table = table.drop(*columns_to_drop) + if col_id_overrides: + table = table.relabel(col_id_overrides) + if fraction is not None: + table = table.filter(ibis.random() < ibis.literal(fraction)) + return table + + def _create_order_columns( + self, + ordering_mode: str, + order_col_name: Optional[str], + expose_hidden_cols: bool, + ) -> typing.Sequence[ibis_types.Value]: + # Generate offsets if current ordering id semantics are not sufficiently strict + if ordering_mode == "offset_col": + return (self._create_offset_column().name(order_col_name),) + elif ordering_mode == "string_encoded": + return (self._create_string_ordering_column().name(order_col_name),) + elif expose_hidden_cols: + return self._hidden_ordering_columns + return () + + def _create_offset_column(self) -> ibis_types.IntegerColumn: + if self._ordering.total_order_col and self._ordering.is_sequential: + offsets = self._get_any_column(self._ordering.total_order_col.column_id) + return typing.cast(ibis_types.IntegerColumn, offsets) + else: + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self._reduced_predicate) + offsets = ibis.row_number().over(window) + return typing.cast(ibis_types.IntegerColumn, offsets) + + def _create_string_ordering_column(self) -> ibis_types.StringColumn: + if self._ordering.total_order_col and self._ordering.is_string_encoded: + string_order_ids = self._get_any_column( + self._ordering.total_order_col.column_id + ) + return typing.cast(ibis_types.StringColumn, string_order_ids) + if ( + self._ordering.total_order_col + and self._ordering.integer_encoding.is_encoded + ): + # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers + int_values = self._get_any_column(self._ordering.total_order_col.column_id) + return encode_order_string( + typing.cast(ibis_types.IntegerColumn, int_values), + ) + else: + # Have to build string from scratch + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self._reduced_predicate) + row_nums = typing.cast( + ibis_types.IntegerColumn, ibis.row_number().over(window) + ) + return encode_order_string(row_nums) + + def _reproject_to_table(self) -> CompiledArrayValue: + """ + Internal operators that projects the internal representation into a + new ibis table expression where each value column is a direct + reference to a column in that table expression. Needed after + some operations such as window operations that cannot be used + recursively in projections. 
+ """ + table = self._to_ibis_expr( + "unordered", + expose_hidden_cols=True, + ) + columns = [table[column_name] for column_name in self._column_names] + ordering_col_ids = [ + ref.column_id for ref in self._ordering.all_ordering_columns + ] + hidden_ordering_columns = [ + table[column_name] + for column_name in self._hidden_ordering_column_names + if column_name in ordering_col_ids + ] + return CompiledArrayValue( + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, + ) + + def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): + group_by: typing.List[ibis_types.Value] = ( + [ + typing.cast( + ibis_types.Column, _as_identity(self._get_ibis_column(column)) + ) + for column in window_spec.grouping_keys + ] + if window_spec.grouping_keys + else [] + ) + if self._reduced_predicate is not None: + group_by.append(self._reduced_predicate) + if window_spec.ordering: + order_by = _convert_ordering_to_table_values( + {**self._column_names, **self._hidden_ordering_column_names}, + window_spec.ordering, + ) + if not allow_ties: + # Most operator need an unambiguous ordering, so the table's total ordering is appended + order_by = tuple([*order_by, *self._ibis_order]) + elif (window_spec.following is not None) or (window_spec.preceding is not None): + # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. + order_by = tuple(self._ibis_order) + else: + # Unbound grouping window. Suitable for aggregations but not for analytic function application. + order_by = None + return ibis.window( + preceding=window_spec.preceding, + following=window_spec.following, + order_by=order_by, + group_by=group_by, + ) + + def unpivot( + self, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> CompiledArrayValue: + """ + Unpivot ArrayValue columns. + + Args: + row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. + unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. + passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. + index_col_id (str): The column id to be used for the row labels. + dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. 
+ + Returns: + ArrayValue: The unpivoted ArrayValue + """ + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr("unordered", expose_hidden_cols=True) + row_n = len(row_labels) + hidden_col_ids = self._hidden_ordering_column_names.keys() + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types + ] + + label_columns = [] + for label_part, (col_id, label_dtype) in enumerate( + zip(index_col_ids, labels_dtypes) + ): + # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels + labels_as_tuples = [ + label if isinstance(label, tuple) else (label,) for label in row_labels + ] + cases = [ + ( + i, + bigframes.dtypes.literal_to_ibis_scalar( + label_tuple[label_part], # type:ignore + force_dtype=label_dtype, # type:ignore + ), + ) + for i, label_tuple in enumerate(labels_as_tuples) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(col_id) + ) + label_columns.append(labels_value) + + unpivot_values = [] + for j in range(len(unpivot_columns)): + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + result_col, source_cols = unpivot_columns[j] + null_value = bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ) + ibis_values = [ + ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + if col is not None + else null_value + for col in source_cols + ] + cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] + unpivot_value = typing.cast( + ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] + ).cases( + cases, default=null_value # type:ignore + ) + unpivot_values.append(unpivot_value.name(result_col)) + + unpivot_table = unpivot_table.select( + passthrough_columns, + *label_columns, + *unpivot_values, + *hidden_col_ids, + unpivot_offset_id, + ) + + # Extend the original ordering using unpivot_offset_id + old_ordering = self._ordering + if how == "left": + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + *old_ordering.ordering_value_columns, + OrderingColumnReference(unpivot_offset_id), + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + else: # how=="right" + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + OrderingColumnReference(unpivot_offset_id), + *old_ordering.ordering_value_columns, + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + value_columns = [ + unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns + ] + passthrough_values = [unpivot_table[col] for col in passthrough_columns] + hidden_ordering_columns = [ + 
unpivot_table[unpivot_offset_id], + *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], + ] + return CompiledArrayValue( + table=unpivot_table, + columns=[ + *[unpivot_table[col_id] for col_id in index_col_ids], + *value_columns, + *passthrough_values, + ], + hidden_ordering_columns=hidden_ordering_columns, + ordering=new_ordering, + ) + + def assign(self, source_id: str, destination_id: str) -> CompiledArrayValue: + return self._set_or_replace_by_id( + destination_id, self._get_ibis_column(source_id) + ) + + def assign_constant( + self, + destination_id: str, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> CompiledArrayValue: + # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. + ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) + if ibis_value is None: + raise NotImplementedError( + f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" + ) + expr = self._set_or_replace_by_id(destination_id, ibis_value) + return expr._reproject_to_table() + + def _set_or_replace_by_id( + self, id: str, new_value: ibis_types.Value + ) -> CompiledArrayValue: + """Safely assign by id while maintaining ordering integrity.""" + # TODO: Split into explicit set and replace methods + ordering_col_ids = [ + col_ref.column_id for col_ref in self._ordering.ordering_value_columns + ] + if id in ordering_col_ids: + return self._hide_column(id)._set_or_replace_by_id(id, new_value) + + builder = self.builder() + if id in self.column_ids: + builder.columns = [ + val if (col_id != id) else new_value.name(id) + for col_id, val in zip(self.column_ids, self._columns) + ] + else: + builder.columns = [*self.columns, new_value.name(id)] + return builder.build() + + +class ArrayValueBuilder: + """Mutable expression class. + Use ArrayValue.builder() to create from a ArrayValue object. 
+ """ + + def __init__( + self, + table: ibis_types.Table, + ordering: ExpressionOrdering, + columns: Collection[ibis_types.Value] = (), + hidden_ordering_columns: Collection[ibis_types.Value] = (), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self.table = table + self.columns = list(columns) + self.hidden_ordering_columns = list(hidden_ordering_columns) + self.ordering = ordering + self.predicates = list(predicates) if predicates is not None else None + + def build(self) -> CompiledArrayValue: + return CompiledArrayValue( + table=self.table, + columns=self.columns, + hidden_ordering_columns=self.hidden_ordering_columns, + ordering=self.ordering, + predicates=self.predicates, + ) + + +def _reduce_predicate_list( + predicate_list: typing.Collection[ibis_types.BooleanValue], +) -> ibis_types.BooleanValue: + """Converts a list of predicates BooleanValues into a single BooleanValue.""" + if len(predicate_list) == 0: + raise ValueError("Cannot reduce empty list of predicates") + if len(predicate_list) == 1: + (item,) = predicate_list + return item + return functools.reduce(lambda acc, pred: acc.__and__(pred), predicate_list) + + +def _convert_ordering_to_table_values( + value_lookup: typing.Mapping[str, ibis_types.Value], + ordering_columns: typing.Sequence[OrderingColumnReference], +) -> typing.Sequence[ibis_types.Value]: + column_refs = ordering_columns + ordering_values = [] + for ordering_col in column_refs: + column = typing.cast(ibis_types.Column, value_lookup[ordering_col.column_id]) + ordering_value = ( + ibis.asc(column) + if ordering_col.direction.is_ascending + else ibis.desc(column) + ) + # Bigquery SQL considers NULLS to be "smallest" values, but we need to override in these cases. + if (not ordering_col.na_last) and (not ordering_col.direction.is_ascending): + # Force nulls to be first + is_null_val = typing.cast(ibis_types.Column, column.isnull()) + ordering_values.append(ibis.desc(is_null_val)) + elif (ordering_col.na_last) and (ordering_col.direction.is_ascending): + # Force nulls to be last + is_null_val = typing.cast(ibis_types.Column, column.isnull()) + ordering_values.append(ibis.asc(is_null_val)) + ordering_values.append(ordering_value) + return ordering_values + + +def _as_identity(value: ibis_types.Value): + # Some types need to be converted to string to enable groupby + if value.type().is_float64() or value.type().is_geospatial(): + return value.cast(ibis_dtypes.str) + return value diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py new file mode 100644 index 0000000000..195d830122 --- /dev/null +++ b/bigframes/core/compile/compiler.py @@ -0,0 +1,185 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
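The compiler module that follows pairs functools.cache with functools.singledispatch: compile_node memoizes results per node instance, while _compile_node routes each BigFrameNode subclass to its own handler. Because functools.cache keys on the argument, this only works if node objects are hashable, which is why the ArrayValue methods earlier in this patch convert sequences to tuples before building nodes. A minimal, self-contained sketch of the same pattern with toy classes (not the real bigframes node types):

    import functools
    from dataclasses import dataclass

    @dataclass(frozen=True)
    class ToyNode:
        name: str

    @dataclass(frozen=True)
    class ToyFilterNode(ToyNode):
        predicate: str

    @functools.cache
    def toy_compile(node: ToyNode) -> str:
        # Cached entry point; dispatch happens on the concrete node type.
        return _toy_compile(node)

    @functools.singledispatch
    def _toy_compile(node: ToyNode) -> str:
        raise ValueError(f"Can't compile unrecognized node: {node}")

    @_toy_compile.register
    def _(node: ToyFilterNode) -> str:
        return f"FILTER({node.predicate})"

    assert toy_compile(ToyFilterNode("t", "x > 0")) == "FILTER(x > 0)"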
+from __future__ import annotations + +import functools +import io +import typing + +import pandas as pd + +import bigframes.core.compile as compiled +import bigframes.core.compile.single_column +import bigframes.core.nodes as nodes + +if typing.TYPE_CHECKING: + import bigframes.core + import bigframes.session + + +@functools.cache +def compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: + """Compile node into CompileArrayValue. Caches result.""" + return _compile_node(node) + + +@functools.singledispatch +def _compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unnrecognized node: {node}") + + +@_compile_node.register +def compile_join(node: nodes.JoinNode): + compiled_left = compile_node(node.left_child) + compiled_right = compile_node(node.right_child) + return bigframes.core.compile.single_column.join_by_column( + compiled_left, + node.left_column_ids, + compiled_right, + node.right_column_ids, + how=node.how, + allow_row_identity_join=node.allow_row_identity_join, + ) + + +@_compile_node.register +def compile_select(node: nodes.SelectNode): + return compile_node(node.child).select_columns(node.column_ids) + + +@_compile_node.register +def compile_drop(node: nodes.DropColumnsNode): + return compile_node(node.child).drop_columns(node.columns) + + +@_compile_node.register +def compile_readlocal(node: nodes.ReadLocalNode): + array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) + return compiled.CompiledArrayValue.mem_expr_from_pandas(array_as_pd) + + +@_compile_node.register +def compile_readgbq(node: nodes.ReadGbqNode): + return compiled.CompiledArrayValue( + node.table, + node.columns, + node.hidden_ordering_columns, + node.ordering, + ) + + +@_compile_node.register +def compile_promote_offsets(node: nodes.PromoteOffsetsNode): + return compile_node(node.child).promote_offsets(node.col_id) + + +@_compile_node.register +def compile_filter(node: nodes.FilterNode): + return compile_node(node.child).filter(node.predicate_id, node.keep_null) + + +@_compile_node.register +def compile_orderby(node: nodes.OrderByNode): + return compile_node(node.child).order_by(node.by, node.stable) + + +@_compile_node.register +def compile_reversed(node: nodes.ReversedNode): + return compile_node(node.child).reversed() + + +@_compile_node.register +def compile_project_unary(node: nodes.ProjectUnaryOpNode): + return compile_node(node.child).project_unary_op( + node.input_id, node.op, node.output_id + ) + + +@_compile_node.register +def compile_project_binary(node: nodes.ProjectBinaryOpNode): + return compile_node(node.child).project_binary_op( + node.left_input_id, node.right_input_id, node.op, node.output_id + ) + + +@_compile_node.register +def compile_project_ternary(node: nodes.ProjectTernaryOpNode): + return compile_node(node.child).project_ternary_op( + node.input_id1, node.input_id2, node.input_id3, node.op, node.output_id + ) + + +@_compile_node.register +def compile_concat(node: nodes.ConcatNode): + compiled_nodes = [compile_node(node) for node in node.children] + return compiled_nodes[0].concat(compiled_nodes[1:]) + + +@_compile_node.register +def compile_aggregate(node: nodes.AggregateNode): + return compile_node(node.child).aggregate( + node.aggregations, node.by_column_ids, node.dropna + ) + + +@_compile_node.register +def compile_corr(node: nodes.CorrNode): + return compile_node(node.child).corr_aggregate(node.corr_aggregations) + + 
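Handlers like the ones above are also the extension point for future node types: a new logical operation needs only a node class plus one registered function. Purely as an illustration (LimitNode, node.n, and the .limit() method are hypothetical, not part of this patch), such a handler would follow the same shape:

    @_compile_node.register
    def compile_limit(node: nodes.LimitNode):
        return compile_node(node.child).limit(node.n)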
+@_compile_node.register +def compile_window(node: nodes.WindowOpNode): + return compile_node(node.child).project_window_op( + node.column_name, + node.op, + node.window_spec, + node.output_name, + never_skip_nulls=node.never_skip_nulls, + skip_reproject_unsafe=node.skip_reproject_unsafe, + ) + + +@_compile_node.register +def compile_reproject(node: nodes.ReprojectOpNode): + return compile_node(node.child)._reproject_to_table() + + +@_compile_node.register +def compile_unpivot(node: nodes.UnpivotNode): + return compile_node(node.child).unpivot( + node.row_labels, + node.unpivot_columns, + passthrough_columns=node.passthrough_columns, + index_col_ids=node.index_col_ids, + dtype=node.dtype, + how=node.how, + ) + + +@_compile_node.register +def compile_assign(node: nodes.AssignNode): + return compile_node(node.child).assign(node.source_id, node.destination_id) + + +@_compile_node.register +def compile_assign_constant(node: nodes.AssignConstantNode): + return compile_node(node.child).assign_constant( + node.destination_id, node.value, node.dtype + ) + + +@_compile_node.register +def compiler_random_sample(node: nodes.RandomSampleNode): + return compile_node(node.child)._uniform_sampling(node.fraction) diff --git a/bigframes/core/joins/row_identity.py b/bigframes/core/compile/row_identity.py similarity index 94% rename from bigframes/core/joins/row_identity.py rename to bigframes/core/compile/row_identity.py index 76e456ec94..2e9bc0527c 100644 --- a/bigframes/core/joins/row_identity.py +++ b/bigframes/core/compile/row_identity.py @@ -23,15 +23,16 @@ import ibis.expr.types as ibis_types import bigframes.constants as constants -import bigframes.core as core +import bigframes.core.compile as compiled import bigframes.core.joins.name_resolution as naming +import bigframes.core.ordering as orderings SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"} def join_by_row_identity( - left: core.ArrayValue, right: core.ArrayValue, *, how: str -) -> core.ArrayValue: + left: compiled.CompiledArrayValue, right: compiled.CompiledArrayValue, *, how: str +) -> compiled.CompiledArrayValue: """Compute join when we are joining by row identity not a specific column.""" if how not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( @@ -101,8 +102,8 @@ def join_by_row_identity( ) # Assume that left ordering is sufficient since 1:1 join over same base table join_total_order_cols = left_total_order_cols - new_ordering = core.ExpressionOrdering( - ordering_columns, total_ordering_columns=join_total_order_cols + new_ordering = orderings.ExpressionOrdering( + tuple(ordering_columns), total_ordering_columns=join_total_order_cols ) hidden_ordering_columns = [ @@ -117,8 +118,7 @@ def join_by_row_identity( if key.column_id in right._hidden_ordering_column_names.keys() ] - joined_expr = core.ArrayValue( - left._session, + joined_expr = compiled.CompiledArrayValue( left._table, columns=joined_columns, hidden_ordering_columns=hidden_ordering_columns, diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/compile/single_column.py similarity index 87% rename from bigframes/core/joins/single_column.py rename to bigframes/core/compile/single_column.py index 0c0e2008b5..b992aa1d1d 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -23,16 +23,16 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types -import bigframes.core as core -import bigframes.core.joins.name_resolution as naming -import bigframes.core.joins.row_identity -import 
bigframes.core.ordering +import bigframes.core.compile as compiled +import bigframes.core.compile.row_identity +import bigframes.core.joins as joining +import bigframes.core.ordering as orderings def join_by_column( - left: core.ArrayValue, + left: compiled.CompiledArrayValue, left_column_ids: typing.Sequence[str], - right: core.ArrayValue, + right: compiled.CompiledArrayValue, right_column_ids: typing.Sequence[str], *, how: Literal[ @@ -42,7 +42,7 @@ def join_by_column( "right", ], allow_row_identity_join: bool = True, -) -> core.ArrayValue: +) -> compiled.CompiledArrayValue: """Join two expressions by column equality. Arguments: @@ -61,7 +61,7 @@ def join_by_column( """ if ( allow_row_identity_join - and how in bigframes.core.joins.row_identity.SUPPORTED_ROW_IDENTITY_HOW + and how in bigframes.core.compile.row_identity.SUPPORTED_ROW_IDENTITY_HOW and left._table.equals(right._table) # Make sure we're joining on exactly the same column(s), at least with # regards to value its possible that they both have the same names but @@ -73,15 +73,15 @@ def join_by_column( for lcol, rcol in zip(left_column_ids, right_column_ids) ) ): - return bigframes.core.joins.row_identity.join_by_row_identity( + return bigframes.core.compile.row_identity.join_by_row_identity( left, right, how=how ) else: # Value column mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result - l_public_mapping, r_public_mapping = naming.JOIN_NAME_REMAPPER( + l_public_mapping, r_public_mapping = joining.JOIN_NAME_REMAPPER( left.column_ids, right.column_ids ) - l_hidden_mapping, r_hidden_mapping = naming.JoinNameRemapper( + l_hidden_mapping, r_hidden_mapping = joining.JoinNameRemapper( namespace="hidden" )(left._hidden_column_ids, right._hidden_column_ids) l_mapping = {**l_public_mapping, **l_hidden_mapping} @@ -134,8 +134,7 @@ def join_by_column( for col in right._hidden_ordering_columns ], ] - return core.ArrayValue( - left._session, + return compiled.CompiledArrayValue( combined_table, columns=columns, hidden_ordering_columns=hidden_ordering_columns, @@ -151,12 +150,12 @@ def value_to_join_key(value: ibis_types.Value): def join_orderings( - left: core.ExpressionOrdering, - right: core.ExpressionOrdering, + left: orderings.ExpressionOrdering, + right: orderings.ExpressionOrdering, left_id_mapping: Mapping[str, str], right_id_mapping: Mapping[str, str], left_order_dominates: bool = True, -) -> core.ExpressionOrdering: +) -> orderings.ExpressionOrdering: left_ordering_refs = [ ref.with_name(left_id_mapping[ref.column_id]) for ref in left.all_ordering_columns @@ -176,7 +175,7 @@ def join_orderings( right_total_order_cols = frozenset( [right_id_mapping[id] for id in right.total_ordering_columns] ) - return core.ExpressionOrdering( - ordering_value_columns=joined_refs, + return orderings.ExpressionOrdering( + ordering_value_columns=tuple(joined_refs), total_ordering_columns=left_total_order_cols | right_total_order_cols, ) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index fb2043bea7..4e046a426f 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -195,7 +195,7 @@ def cumprod(self, *args, **kwargs) -> df.DataFrame: def shift(self, periods=1) -> series.Series: window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -203,7 +203,7 @@ def shift(self, periods=1) -> series.Series: def diff(self, 
periods=1) -> series.Series: window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -212,7 +212,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, min_periods=min_periods or window, @@ -227,7 +227,7 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: def expanding(self, min_periods: int = 1) -> windows.Window: window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), following=0, min_periods=min_periods, ) @@ -391,7 +391,7 @@ def _apply_window_op( ): """Apply window op to groupby. Defaults to grouped cumulative window.""" window_spec = window or core.WindowSpec( - grouping_keys=self._by_col_ids, following=0 + grouping_keys=tuple(self._by_col_ids), following=0 ) columns = self._aggregated_columns(numeric_only=numeric_only) block, result_ids = self._block.multi_apply_window_op( @@ -531,7 +531,7 @@ def cumcount(self, *args, **kwargs) -> series.Series: def shift(self, periods=1) -> series.Series: """Shift index by desired number of periods.""" window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -539,7 +539,7 @@ def shift(self, periods=1) -> series.Series: def diff(self, periods=1) -> series.Series: window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -548,7 +548,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, min_periods=min_periods or window, @@ -567,7 +567,7 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: def expanding(self, min_periods: int = 1) -> windows.Window: window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), following=0, min_periods=min_periods, ) @@ -600,7 +600,7 @@ def _apply_window_op( ): """Apply window op to groupby. 
Defaults to grouped cumulative window.""" window_spec = window or core.WindowSpec( - grouping_keys=self._by_col_ids, following=0 + grouping_keys=tuple(self._by_col_ids), following=0 ) label = self._value_name if not discard_name else None diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index d18a0a38ef..f6ce084714 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -311,7 +311,7 @@ def _loc_getitem_series_or_dataframe( values = [entry[i] for entry in key] index_cols_dict[index_name] = values keys_df = bigframes.dataframe.DataFrame( - index_cols_dict, session=series_or_dataframe._get_block().expr._session + index_cols_dict, session=series_or_dataframe._get_block().expr.session ) keys_df = keys_df.set_index(temporary_index_names, drop=True) keys_df = keys_df.rename_axis(original_index_names) @@ -324,7 +324,7 @@ def _loc_getitem_series_or_dataframe( index_name = "unnamed_col" keys_df = bigframes.dataframe.DataFrame( {index_name: key}, - session=series_or_dataframe._get_block().expr._session, + session=series_or_dataframe._get_block().expr.session, ) keys_df = keys_df.set_index(index_name, drop=True) if index_name_is_none: @@ -343,7 +343,7 @@ def _loc_getitem_series_or_dataframe( elif pd.api.types.is_scalar(key): index_name = "unnamed_col" keys_df = bigframes.dataframe.DataFrame( - {index_name: [key]}, session=series_or_dataframe._get_block().expr._session + {index_name: [key]}, session=series_or_dataframe._get_block().expr.session ) keys_df = keys_df.set_index(index_name, drop=True) keys_df.index.name = None diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index b9ffdff21e..6c66c36062 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -26,8 +26,7 @@ import bigframes.core as core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks -import bigframes.core.joins as joins -import bigframes.core.joins.name_resolution as join_names +import bigframes.core.joins as joining import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dtypes @@ -402,7 +401,7 @@ def to_pandas(self) -> pandas.Index: dtypes = dict(zip(index_columns, self.dtypes)) expr = self._expr.select_columns(index_columns) results, _ = expr.start_query() - df = expr._session._rows_to_dataframe(results, dtypes) + df = expr.session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index index.names = list(self._block._index_labels) @@ -461,11 +460,10 @@ def join_mono_indexed( ) -> Tuple[IndexValue, Tuple[Mapping[str, str], Mapping[str, str]],]: left_expr = left._block.expr right_expr = right._block.expr - get_column_left, get_column_right = join_names.JOIN_NAME_REMAPPER( + get_column_left, get_column_right = joining.JOIN_NAME_REMAPPER( left_expr.column_ids, right_expr.column_ids ) - combined_expr = joins.join_by_column( - left._block.expr, + combined_expr = left._block.expr.join( left._block.index_columns, right._block.expr, right._block.index_columns, @@ -520,12 +518,11 @@ def join_multi_indexed( left_expr = left._block.expr right_expr = right._block.expr - get_column_left, get_column_right = join_names.JOIN_NAME_REMAPPER( + get_column_left, get_column_right = joining.JOIN_NAME_REMAPPER( left_expr.column_ids, right_expr.column_ids ) - combined_expr = joins.join_by_column( - left_expr, + combined_expr = left_expr.join( left_join_ids, right_expr, right_join_ids, diff --git a/bigframes/core/joins/__init__.py 
b/bigframes/core/joins/__init__.py index 3f9447aef0..5d407ec22b 100644 --- a/bigframes/core/joins/__init__.py +++ b/bigframes/core/joins/__init__.py @@ -15,11 +15,6 @@ """Helpers to join ArrayValue objects.""" from bigframes.core.joins.merge import merge -from bigframes.core.joins.row_identity import join_by_row_identity -from bigframes.core.joins.single_column import join_by_column +from bigframes.core.joins.name_resolution import JOIN_NAME_REMAPPER, JoinNameRemapper -__all__ = ( - "join_by_row_identity", - "join_by_column", - "merge", -) +__all__ = ("merge", "JoinNameRemapper", "JOIN_NAME_REMAPPER") diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py new file mode 100644 index 0000000000..7b252b164f --- /dev/null +++ b/bigframes/core/nodes.py @@ -0,0 +1,245 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from dataclasses import dataclass, field +import functools +import typing +from typing import Optional, Tuple + +import pandas + +import bigframes.core.guid +from bigframes.core.ordering import OrderingColumnReference +import bigframes.core.window_spec as window +import bigframes.dtypes +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + import ibis.expr.types as ibis_types + + import bigframes.core.ordering as orderings + import bigframes.session + + +@dataclass(frozen=True) +class BigFrameNode: + """ + Immutable node for representing 2D typed array as a tree of operators. + + All subclasses must be hashable so as to be usable as caching key. + """ + + @property + def deterministic(self) -> bool: + """Whether this node will evaluates deterministically.""" + return True + + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + """Direct children of this node""" + return tuple([]) + + @functools.cached_property + def session(self): + sessions = [] + for child in self.child_nodes: + if child.session is not None: + sessions.append(child.session) + unique_sessions = len(set(sessions)) + if unique_sessions > 1: + raise ValueError("Cannot use combine sources from multiple sessions.") + elif unique_sessions == 1: + return sessions[0] + return None + + +@dataclass(frozen=True) +class UnaryNode(BigFrameNode): + child: BigFrameNode + + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + return (self.child,) + + +@dataclass(frozen=True) +class JoinNode(BigFrameNode): + left_child: BigFrameNode + right_child: BigFrameNode + left_column_ids: typing.Tuple[str, ...] + right_column_ids: typing.Tuple[str, ...] + how: typing.Literal[ + "inner", + "left", + "outer", + "right", + ] + allow_row_identity_join: bool = True + + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + return (self.left_child, self.right_child) + + +@dataclass(frozen=True) +class ConcatNode(BigFrameNode): + children: Tuple[BigFrameNode, ...] 
+ + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + return self.children + + +# Input Nodex +@dataclass(frozen=True) +class ReadLocalNode(BigFrameNode): + feather_bytes: bytes + column_ids: typing.Tuple[str, ...] + + +# TODO: Refactor to take raw gbq object reference +@dataclass(frozen=True) +class ReadGbqNode(BigFrameNode): + table: ibis_types.Table = field() + table_session: bigframes.session.Session = field() + columns: Tuple[ibis_types.Value, ...] = field() + hidden_ordering_columns: Tuple[ibis_types.Value, ...] = field() + ordering: orderings.ExpressionOrdering = field() + + @property + def session(self): + return (self.table_session,) + + +# Unary nodes +@dataclass(frozen=True) +class DropColumnsNode(UnaryNode): + columns: Tuple[str, ...] + + +@dataclass(frozen=True) +class PromoteOffsetsNode(UnaryNode): + col_id: str + + +@dataclass(frozen=True) +class FilterNode(UnaryNode): + predicate_id: str + keep_null: bool = False + + +@dataclass(frozen=True) +class OrderByNode(UnaryNode): + by: Tuple[OrderingColumnReference, ...] + stable: bool = False + + +@dataclass(frozen=True) +class ReversedNode(UnaryNode): + pass + + +@dataclass(frozen=True) +class SelectNode(UnaryNode): + column_ids: typing.Tuple[str, ...] + + +@dataclass(frozen=True) +class ProjectUnaryOpNode(UnaryNode): + input_id: str + op: ops.UnaryOp + output_id: Optional[str] = None + + +@dataclass(frozen=True) +class ProjectBinaryOpNode(UnaryNode): + left_input_id: str + right_input_id: str + op: ops.BinaryOp + output_id: str + + +@dataclass(frozen=True) +class ProjectTernaryOpNode(UnaryNode): + input_id1: str + input_id2: str + input_id3: str + op: ops.TernaryOp + output_id: str + + +@dataclass(frozen=True) +class AggregateNode(UnaryNode): + aggregations: typing.Tuple[typing.Tuple[str, agg_ops.AggregateOp, str], ...] + by_column_ids: typing.Tuple[str, ...] = tuple([]) + dropna: bool = True + + +# TODO: Unify into aggregate +@dataclass(frozen=True) +class CorrNode(UnaryNode): + corr_aggregations: typing.Tuple[typing.Tuple[str, str, str], ...] + + +@dataclass(frozen=True) +class WindowOpNode(UnaryNode): + column_name: str + op: agg_ops.WindowOp + window_spec: window.WindowSpec + output_name: typing.Optional[str] = None + never_skip_nulls: bool = False + skip_reproject_unsafe: bool = False + + +@dataclass(frozen=True) +class ReprojectOpNode(UnaryNode): + pass + + +@dataclass(frozen=True) +class UnpivotNode(UnaryNode): + row_labels: typing.Tuple[typing.Hashable, ...] + unpivot_columns: typing.Tuple[ + typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]], ... + ] + passthrough_columns: typing.Tuple[str, ...] = () + index_col_ids: typing.Tuple[str, ...] = ("index",) + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
+ ] = (pandas.Float64Dtype(),) + how: typing.Literal["left", "right"] = "left" + + +@dataclass(frozen=True) +class AssignNode(UnaryNode): + source_id: str + destination_id: str + + +@dataclass(frozen=True) +class AssignConstantNode(UnaryNode): + destination_id: str + value: typing.Hashable + dtype: typing.Optional[bigframes.dtypes.Dtype] + + +@dataclass(frozen=True) +class RandomSampleNode(UnaryNode): + fraction: float + + @property + def deterministic(self) -> bool: + return False diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index d5f07ecf91..2cecd2fe7b 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -86,7 +86,7 @@ class IntegerEncoding: class ExpressionOrdering: """Immutable object that holds information about the ordering of rows in a ArrayValue object.""" - ordering_value_columns: Sequence[OrderingColumnReference] = () + ordering_value_columns: typing.Tuple[OrderingColumnReference, ...] = () integer_encoding: IntegerEncoding = IntegerEncoding(False) string_encoding: StringEncoding = StringEncoding(False) # A table has a total ordering defined by the identities of a set of 1 or more columns. @@ -170,7 +170,7 @@ def with_column_remap(self, mapping: typing.Mapping[str, str]): mapping.get(col_id, col_id) for col_id in self.total_ordering_columns ) return ExpressionOrdering( - new_value_columns, + tuple(new_value_columns), integer_encoding=self.integer_encoding, string_encoding=self.string_encoding, total_ordering_columns=new_total_order, diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py new file mode 100644 index 0000000000..3458bfb1b8 --- /dev/null +++ b/bigframes/core/window_spec.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +import typing + +import bigframes.core.ordering as orderings + + +@dataclass(frozen=True) +class WindowSpec: + """ + Specifies a window over which aggregate and analytic function may be applied. + grouping_keys: set of column ids to group on + preceding: Number of preceding rows in the window + following: Number of preceding rows in the window + ordering: List of columns ids and ordering direction to override base ordering + """ + + grouping_keys: typing.Tuple[str, ...] = tuple() + ordering: typing.Tuple[orderings.OrderingColumnReference, ...] 
= tuple() + preceding: typing.Optional[int] = None + following: typing.Optional[int] = None + min_periods: int = 0 diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 96c74de1cd..d0c14cac60 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -172,9 +172,7 @@ def __init__( if isinstance(dt, pandas.ArrowDtype) ) ): - self._block = blocks.block_from_local( - pd_dataframe, session or bigframes.pandas.get_global_session() - ) + self._block = blocks.block_from_local(pd_dataframe) elif session: self._block = session.read_pandas(pd_dataframe)._get_block() else: @@ -301,7 +299,7 @@ def values(self) -> numpy.ndarray: @property def _session(self) -> bigframes.Session: - return self._get_block().expr._session + return self._get_block().expr.session def __len__(self): rows, _ = self.shape @@ -1109,7 +1107,7 @@ def _assign_single_item( ) local_df = bigframes.dataframe.DataFrame( - {k: v}, session=self._get_block().expr._session + {k: v}, session=self._get_block().expr.session ) # local_df is likely (but not guarunteed) to be cached locally # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE @@ -2205,7 +2203,7 @@ def to_csv( field_delimiter=sep, header=header, ) - _, query_job = self._block.expr._session._start_query(export_data_statement) + _, query_job = self._block.expr.session._start_query(export_data_statement) self._set_internal_query_job(query_job) def to_json( @@ -2247,7 +2245,7 @@ def to_json( format="JSON", export_options={}, ) - _, query_job = self._block.expr._session._start_query(export_data_statement) + _, query_job = self._block.expr.session._start_query(export_data_statement) self._set_internal_query_job(query_job) def to_gbq( @@ -2276,7 +2274,7 @@ def to_gbq( write_disposition=dispositions[if_exists], destination=bigquery.table.TableReference.from_string( destination_table, - default_project=self._block.expr._session.bqclient.project, + default_project=self._block.expr.session.bqclient.project, ), ) @@ -2323,7 +2321,7 @@ def to_parquet( format="PARQUET", export_options=export_options, ) - _, query_job = self._block.expr._session._start_query(export_data_statement) + _, query_job = self._block.expr.session._start_query(export_data_statement) self._set_internal_query_job(query_job) def to_dict( @@ -2466,7 +2464,7 @@ def _run_io_query( """Executes a query job presenting this dataframe and returns the destination table.""" expr = self._block.expr - session = expr._session + session = expr.session sql = self._create_io_query(index=index, ordering_id=ordering_id) _, query_job = session._start_query( sql=sql, job_config=job_config # type: ignore diff --git a/bigframes/ml/metrics.py b/bigframes/ml/metrics.py index 3bcb621f74..5731b946ca 100644 --- a/bigframes/ml/metrics.py +++ b/bigframes/ml/metrics.py @@ -96,7 +96,7 @@ def roc_curve( y_true_series, y_score_series = utils.convert_to_series(y_true, y_score) - session = y_true_series._block.expr._session + session = y_true_series._block.expr.session # We operate on rows, so, remove the index if there is one # TODO(bmil): check that the indexes are equivalent before removing diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index b9abb2cc03..d33befe4da 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -94,9 +94,7 @@ def __init__( if isinstance(dt, pd.ArrowDtype) ) ): - self._block = blocks.block_from_local( - pd_dataframe, session or bigframes.pandas.get_global_session() - ) + self._block = blocks.block_from_local(pd_dataframe) elif 
session: self._block = session.read_pandas(pd_dataframe)._get_block() else: diff --git a/bigframes/series.py b/bigframes/series.py index b65581a3ac..032f894f06 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -29,7 +29,6 @@ import bigframes.constants as constants import bigframes.core -from bigframes.core import WindowSpec import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.groupby as groupby @@ -43,6 +42,7 @@ import bigframes.core.scalar as scalars import bigframes.core.utils as utils import bigframes.core.window +import bigframes.core.window_spec import bigframes.dataframe import bigframes.dtypes import bigframes.formatting_helpers as formatter @@ -369,43 +369,43 @@ def between(self, left, right, inclusive="both"): def cumsum(self) -> Series: return self._apply_window_op( - agg_ops.sum_op, bigframes.core.WindowSpec(following=0) + agg_ops.sum_op, bigframes.core.window_spec.WindowSpec(following=0) ) def ffill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.WindowSpec(preceding=limit, following=0) + window = bigframes.core.window_spec.WindowSpec(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill def bfill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.WindowSpec(preceding=0, following=limit) + window = bigframes.core.window_spec.WindowSpec(preceding=0, following=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def cummax(self) -> Series: return self._apply_window_op( - agg_ops.max_op, bigframes.core.WindowSpec(following=0) + agg_ops.max_op, bigframes.core.window_spec.WindowSpec(following=0) ) def cummin(self) -> Series: return self._apply_window_op( - agg_ops.min_op, bigframes.core.WindowSpec(following=0) + agg_ops.min_op, bigframes.core.window_spec.WindowSpec(following=0) ) def cumprod(self) -> Series: return self._apply_window_op( - agg_ops.product_op, bigframes.core.WindowSpec(following=0) + agg_ops.product_op, bigframes.core.window_spec.WindowSpec(following=0) ) def shift(self, periods: int = 1) -> Series: - window = bigframes.core.WindowSpec( + window = bigframes.core.window_spec.WindowSpec( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.ShiftOp(periods), window) def diff(self, periods: int = 1) -> Series: - window = bigframes.core.WindowSpec( + window = bigframes.core.window_spec.WindowSpec( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -807,7 +807,7 @@ def mode(self) -> Series: block, max_value_count_col_id = block.apply_window_op( value_count_col_id, agg_ops.max_op, - window_spec=WindowSpec(), + window_spec=bigframes.core.window_spec.WindowSpec(), ) block, is_mode_col_id = block.apply_binary_op( value_count_col_id, @@ -1011,9 +1011,7 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> Any: return self._block.get_stat(self._value_column, op) def _apply_window_op( - self, - op: agg_ops.WindowOp, - window_spec: bigframes.core.WindowSpec, + self, op: agg_ops.WindowOp, window_spec: bigframes.core.window_spec.WindowSpec ): block = self._block block, result_id = block.apply_window_op( @@ -1072,7 +1070,7 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. 
- window_spec = WindowSpec( + window_spec = bigframes.core.window_spec.WindowSpec( preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( @@ -1080,7 +1078,9 @@ def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window ) def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: - window_spec = WindowSpec(following=0, min_periods=min_periods) + window_spec = bigframes.core.window_spec.WindowSpec( + following=0, min_periods=min_periods + ) return bigframes.core.window.Window( self._block, window_spec, self._block.value_columns, is_series=True ) @@ -1253,7 +1253,7 @@ def reindex(self, index=None, *, validate: typing.Optional[bool] = None): "Cannot reindex with index with different nlevels" ) new_indexer = bigframes.dataframe.DataFrame( - index=index, session=self._get_block().expr._session + index=index, session=self._get_block().expr.session )[[]] # multiindex join is senstive to index names, so we will set all these result = new_indexer.rename_axis(range(new_indexer.index.nlevels)).join( @@ -1417,7 +1417,7 @@ def map( elif isinstance(arg, Mapping): map_df = bigframes.dataframe.DataFrame( {"keys": list(arg.keys()), self.name: list(arg.values())}, - session=self._get_block().expr._session, + session=self._get_block().expr.session, ) map_df = map_df.set_index("keys") elif callable(arg): diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index bd36820b6e..edac94ef3b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -68,6 +68,7 @@ import bigframes.core.blocks as blocks import bigframes.core.guid as guid from bigframes.core.ordering import IntegerEncoding, OrderingColumnReference +import bigframes.core.ordering as orderings import bigframes.core.utils as utils import bigframes.dataframe as dataframe import bigframes.formatting_helpers as formatting_helpers @@ -208,6 +209,10 @@ def _session_dataset_id(self): def _project(self): return self.bqclient.project + def __hash__(self): + # Stable hash needed to use in expression tree + return hash(self._session_id) + def _create_and_bind_bq_session(self): """Create a BQ session and bind the session id with clients to capture BQ activities: go/bigframes-transient-data""" @@ -594,11 +599,13 @@ def _read_gbq_table( # primary key(s) are set on a table. The query engine assumes such # columns are unique, even if not enforced. 
is_total_ordering = True - ordering = core.ExpressionOrdering( - ordering_value_columns=[ - core.OrderingColumnReference(column_id) - for column_id in total_ordering_cols - ], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple( + [ + core.OrderingColumnReference(column_id) + for column_id in total_ordering_cols + ] + ), total_ordering_columns=frozenset(total_ordering_cols), ) @@ -636,10 +643,13 @@ def _read_gbq_table( distinct_count = row["distinct_count"] is_total_ordering = total_count == distinct_count - ordering = core.ExpressionOrdering( - ordering_value_columns=[ - core.OrderingColumnReference(column_id) for column_id in index_cols - ], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple( + [ + core.OrderingColumnReference(column_id) + for column_id in index_cols + ] + ), total_ordering_columns=frozenset(index_cols), ) @@ -715,7 +725,7 @@ def _read_gbq_with_ordering( index_cols: Iterable[str] = (), index_labels: Iterable[Optional[str]] = (), hidden_cols: Iterable[str] = (), - ordering: core.ExpressionOrdering, + ordering: orderings.ExpressionOrdering, is_total_ordering: bool = False, api_name: str, ) -> dataframe.DataFrame: @@ -828,7 +838,7 @@ def _read_ibis( index_labels: Iterable[blocks.Label], column_keys: Iterable[str], column_labels: Iterable[blocks.Label], - ordering: core.ExpressionOrdering, + ordering: orderings.ExpressionOrdering, ) -> dataframe.DataFrame: """Turns a table expression (plus index column) into a DataFrame.""" @@ -845,7 +855,7 @@ def _read_ibis( hidden_ordering_columns.append(table_expression[ref.column_id]) block = blocks.Block( - core.ArrayValue( + core.ArrayValue.from_ibis( self, table_expression, columns, hidden_ordering_columns, ordering ), index_columns=[index_col.get_name() for index_col in index_cols], @@ -961,8 +971,8 @@ def _read_pandas( ) self._start_generic_job(load_job) - ordering = core.ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ordering_col)], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ordering_col)]), total_ordering_columns=frozenset([ordering_col]), integer_encoding=IntegerEncoding(True, is_sequential=True), ) @@ -1305,7 +1315,7 @@ def _create_sequential_ordering( table: ibis_types.Table, index_cols: Iterable[str] = (), api_name: str = "", - ) -> Tuple[ibis_types.Table, core.ExpressionOrdering]: + ) -> Tuple[ibis_types.Table, orderings.ExpressionOrdering]: # Since this might also be used as the index, don't use the default # "ordering ID" name. 
default_ordering_name = guid.generate_guid("bigframes_ordering_") @@ -1322,8 +1332,8 @@ def _create_sequential_ordering( f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" ) ordering_reference = core.OrderingColumnReference(default_ordering_name) - ordering = core.ExpressionOrdering( - ordering_value_columns=[ordering_reference], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple([ordering_reference]), total_ordering_columns=frozenset([default_ordering_name]), integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), ) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index f7fc4eaa8f..084b723fba 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -98,7 +98,7 @@ def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): bf.options.display.progress_bar = "terminal" - penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( + penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = ( False ) penguins_df_default_index.to_pandas() @@ -117,7 +117,7 @@ def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): - penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( + penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = ( False ) penguins_df_default_index.to_pandas() diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c9510290b6..05d8b84185 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2858,7 +2858,7 @@ def test_map_series_input(scalars_dfs): pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] pd_map_series.index = new_index bf_map_series = series.Series( - pd_map_series, session=scalars_df._get_block().expr._session + pd_map_series, session=scalars_df._get_block().expr.session ) pd_result = scalars_pandas_df.int64_too.map(pd_map_series) @@ -2877,7 +2877,7 @@ def test_map_series_input_duplicates_error(scalars_dfs): pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] pd_map_series.index = new_index bf_map_series = series.Series( - pd_map_series, session=scalars_df._get_block().expr._session + pd_map_series, session=scalars_df._get_block().expr.session ) with pytest.raises(pd.errors.InvalidIndexError): diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 127a88a760..bf72e444eb 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -318,7 +318,6 @@ def test_read_pandas(session, scalars_dfs): _, scalars_pandas_df = scalars_dfs df = session.read_pandas(scalars_pandas_df) - assert df._block._expr._ordering is not None result = df.to_pandas() expected = scalars_pandas_df @@ -350,9 +349,8 @@ def test_read_pandas_rowid_exists_adds_suffix(session, scalars_pandas_df_default pandas_df = scalars_pandas_df_default_index.copy() pandas_df["rowid"] = np.arange(pandas_df.shape[0]) - df = session.read_pandas(pandas_df) - total_order_col = df._block._expr._ordering.total_order_col - assert total_order_col and total_order_col.column_id == "rowid_2" + df_roundtrip = session.read_pandas(pandas_df).to_pandas() + 
pd.testing.assert_frame_equal(df_roundtrip, pandas_df, check_dtype=False) def test_read_pandas_tokyo( @@ -385,7 +383,6 @@ def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): # Convert default pandas dtypes to match BigQuery DataFrames dtypes. dtype=dtype, ) - assert df._block._expr._ordering is not None # TODO(chelsealin): If we serialize the index, can more easily compare values. pd.testing.assert_index_equal(df.columns, scalars_df.columns) @@ -441,7 +438,6 @@ def test_read_csv_local_default_engine(session, scalars_dfs, sep): # Convert default pandas dtypes to match BigQuery DataFrames dtypes. dtype=dtype, ) - assert df._block._expr._ordering is not None # TODO(chelsealin): If we serialize the index, can more easily compare values. pd.testing.assert_index_equal(df.columns, scalars_df.columns) @@ -976,7 +972,6 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): orient="records", ) - assert df._block._expr._ordering is not None pd.testing.assert_index_equal(df.columns, scalars_df.columns) # The auto detects of BigQuery load job have restrictions to detect the bytes, diff --git a/tests/unit/core/test_blocks.py b/tests/unit/core/test_blocks.py index a7e9b5a84b..86715d090c 100644 --- a/tests/unit/core/test_blocks.py +++ b/tests/unit/core/test_blocks.py @@ -18,8 +18,6 @@ import bigframes.core.blocks as blocks -from .. import resources - @pytest.mark.parametrize( ("data",), @@ -76,9 +74,8 @@ ) def test_block_from_local(data): expected = pandas.DataFrame(data) - session = resources.create_pandas_session({}) - block = blocks.block_from_local(data, session=session) + block = blocks.block_from_local(data) pandas.testing.assert_index_equal(block.column_labels, expected.columns) assert tuple(block.index_labels) == tuple(expected.index.names) diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 0a68600a35..f660d774f0 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -22,6 +22,7 @@ import bigframes import bigframes.core as core +import bigframes.core.ordering import bigframes.session.clients """Utilities for creating test resources.""" @@ -61,14 +62,20 @@ def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Sess def create_arrayvalue( df: pandas.DataFrame, total_ordering_columns: List[str] -) -> bigframes.core.ArrayValue: +) -> core.ArrayValue: session = create_pandas_session({"test_table": df}) ibis_table = session.ibis_client.table("test_table") columns = tuple(ibis_table[key] for key in ibis_table.columns) - ordering = core.ExpressionOrdering( - [core.OrderingColumnReference(column) for column in total_ordering_columns], + ordering = bigframes.core.ordering.ExpressionOrdering( + tuple( + [core.OrderingColumnReference(column) for column in total_ordering_columns] + ), total_ordering_columns=frozenset(total_ordering_columns), ) - return core.ArrayValue( - session=session, table=ibis_table, columns=columns, ordering=ordering + return core.ArrayValue.from_ibis( + session=session, + table=ibis_table, + columns=columns, + hidden_ordering_columns=(), + ordering=ordering, ) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 69b9e79807..d9672b2635 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -16,6 +16,7 @@ import pandas import bigframes.core as core +import bigframes.core.ordering import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -37,15 +38,19 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): ) ibis_table 
= session.ibis_client.table("test_table") columns = (ibis_table["col1"], ibis_table["col2"], ibis_table["col3"]) - ordering = core.ExpressionOrdering( - [core.OrderingColumnReference("col1")], + ordering = bigframes.core.ordering.ExpressionOrdering( + tuple([core.OrderingColumnReference("col1")]), total_ordering_columns=frozenset(["col1"]), ) - actual = core.ArrayValue( - session=session, table=ibis_table, columns=columns, ordering=ordering + actual = core.ArrayValue.from_ibis( + session=session, + table=ibis_table, + columns=columns, + ordering=ordering, + hidden_ordering_columns=(), ) - assert actual._table is ibis_table - assert len(actual.columns) == 3 + assert actual.compile()._table is ibis_table + assert len(actual.column_ids) == 3 def test_arrayvalue_with_get_column_type(): @@ -78,7 +83,7 @@ def test_arrayvalue_with_get_column(): ), total_ordering_columns=["col1"], ) - col1 = value._get_ibis_column("col1") + col1 = value.compile()._get_ibis_column("col1") assert isinstance(col1, ibis_types.Value) assert col1.get_name() == "col1" assert col1.type().is_int64() @@ -95,7 +100,7 @@ def test_arrayvalues_to_ibis_expr_with_get_column(): ), total_ordering_columns=["col1"], ) - expr = value._get_ibis_column("col1") + expr = value.compile()._get_ibis_column("col1") assert expr.get_name() == "col1" assert expr.type().is_int64() @@ -112,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr._to_ibis_expr("unordered") + actual = expr.compile()._to_ibis_expr("unordered") assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -131,8 +136,8 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_unary_op("col1", ops.AsTypeOp("string")) - assert value.columns[0].type().is_int64() + expr = value.project_unary_op("col1", ops.AsTypeOp("string")).compile() + assert value.compile().columns[0].type().is_int64() assert expr.columns[0].type().is_string() @@ -147,7 +152,7 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_binary_op("col2", "col3", ops.add_op, "col4") + expr = value.project_binary_op("col2", "col3", ops.add_op, "col4").compile() assert expr.columns[3].type().is_float64() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 4 @@ -166,7 +171,9 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_ternary_op("col2", "col3", "col4", ops.where_op, "col5") + expr = value.project_ternary_op( + "col2", "col3", "col4", ops.where_op, "col5" + ).compile() assert expr.columns[4].type().is_float64() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 5 @@ -188,7 +195,7 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): aggregations=(("col1", agg_ops.sum_op, "col4"),), by_column_ids=["col1"], dropna=False, - ) + ).compile() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 2 assert actual.columns[0] == "col1" @@ -207,7 +214,7 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): ), total_ordering_columns=["col1"], ) - expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]) + expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]).compile() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 1 assert actual.columns[0] == 
"col4" From f37d0b0e55b84103b5e4d7c70dcc371421564fb2 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Thu, 26 Oct 2023 18:20:58 -0700 Subject: [PATCH 14/32] fix: fix bug with column names under repeated column assignment (#150) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/dataframe.py | 23 +++++++++++------------ tests/system/small/test_dataframe.py | 22 ++++++++++++++++++++-- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d0c14cac60..b19748e93d 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1112,19 +1112,18 @@ def _assign_single_item( # local_df is likely (but not guarunteed) to be cached locally # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE - this_offsets_col_id = bigframes.core.guid.generate_guid() - this_expr = self._get_block()._expr.promote_offsets(this_offsets_col_id) - block = blocks.Block( - expr=this_expr, - index_labels=self.index.names, - index_columns=self._block.index_columns, - column_labels=[this_offsets_col_id] + list(self._block.value_columns), - ) # offsets are temporarily the first value column, label set to id - this_df_with_offsets = DataFrame(data=block) - join_result = this_df_with_offsets.join( - other=local_df, on=this_offsets_col_id, how="left" + new_column_block = local_df._block + original_index_column_ids = self._block.index_columns + self_block = self._block.reset_index(drop=False) + result_index, (get_column_left, get_column_right) = self_block.index.join( + new_column_block.index, how="left", block_identity_join=True ) - return join_result.drop(columns=[this_offsets_col_id]) + result_block = result_index._block + result_block = result_block.set_index( + [get_column_left[col_id] for col_id in original_index_column_ids], + index_labels=self._block.index_labels, + ) + return DataFrame(result_block) else: return self._assign_scalar(k, v) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index a746a1867c..e459e3bee3 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -505,14 +505,32 @@ def test_assign_new_column_w_setitem_list(scalars_dfs): pd.testing.assert_frame_equal(bf_result, pd_result) +def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + pd_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() pd_df = scalars_pandas_df.copy() # set the custom index - pd_df = pd_df.set_index("string_col") - bf_df = bf_df.set_index("string_col") + pd_df = pd_df.set_index(["string_col", "int64_col"]) + bf_df = bf_df.set_index(["string_col", "int64_col"]) bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] From aba301c9406cff495cbdc6bdba4b51d14a1ef18b Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 27 Oct 2023 22:46:13 +0000 Subject: [PATCH 15/32] test: refactor remote function tests (#147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes moves the tests that deploy cloud function to large remote function tests, and the tests that do not make call to bigquery service to unit tests. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- tests/system/large/test_remote_function.py | 90 ++++++++++++ tests/system/small/test_remote_function.py | 156 ++++----------------- tests/unit/test_remote_function.py | 28 ++++ 3 files changed, 148 insertions(+), 126 deletions(-) create mode 100644 tests/unit/test_remote_function.py diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 730a1dbde4..c8f8f66eba 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -27,6 +27,7 @@ import pytest import test_utils.prefixer +import bigframes from bigframes.remote_function import ( get_cloud_function_name, get_remote_function_locations, @@ -1120,3 +1121,92 @@ def plusone(x): ) for dir_ in dirs_to_cleanup: shutil.rmtree(dir_) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_via_session_context_connection_setter( + scalars_dfs, dataset_id, bq_cf_connection +): + # Creating a session scoped only to this test as we would be setting a + # property in it + context = bigframes.BigQueryOptions() + context.bq_connection = bq_cf_connection + session = bigframes.connect(context) + + try: + # Without an explicit bigquery connection, the one present in Session, + # set via context setter would be used. Without an explicit `reuse` the + # default behavior of reuse=True will take effect. Please note that the + # udf is same as the one used in other tests in this file so the underlying + # cloud function would be common with reuse=True. Since we are using a + # unique dataset_id, even though the cloud function would be reused, the bq + # remote function would still be created, making use of the bq connection + # set in the BigQueryOptions above. 
+ @session.remote_function([int], int, dataset=dataset_id) + def square(x): + return x * x + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_default_connection(session, scalars_dfs, dataset_id): + try: + + @session.remote_function([int], int, dataset=dataset_id) + def square(x): + return x * x + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square + ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index d024a57ded..89907a53df 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -13,14 +13,11 @@ # limitations under the License. 
from google.cloud import bigquery -from ibis.backends.bigquery import datatypes as bq_types -from ibis.expr import datatypes as ibis_types import pandas as pd import pytest import bigframes from bigframes import remote_function as rf -import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal_ignore_ordering @@ -65,45 +62,14 @@ def bq_cf_connection_location_project_mismatched() -> str: @pytest.fixture(scope="module") -def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: - return bigframes.Session(bigframes.BigQueryOptions(bq_connection=bq_cf_connection)) - - -@pytest.fixture(scope="module") -def session_with_bq_connection_location_specified( - bq_cf_connection_location, -) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location) - ) - - -@pytest.fixture(scope="module") -def session_with_bq_connection_location_mistached( - bq_cf_connection_location_mistached, -) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_mistached) - ) - - -@pytest.fixture(scope="module") -def session_with_bq_connection_location_project_specified( - bq_cf_connection_location_project, +def session_with_bq_connection_and_permanent_dataset( + bq_cf_connection, dataset_id_permanent ) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_project) + session = bigframes.Session( + bigframes.BigQueryOptions(bq_connection=bq_cf_connection) ) - - -def test_supported_types_correspond(): - # The same types should be representable by the supported Python and BigQuery types. - ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES} - ibis_types_from_bigquery = { - bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS - } - - assert ibis_types_from_python == ibis_types_from_bigquery + session._session_dataset = bigquery.Dataset(dataset_id_permanent) + return session @pytest.mark.flaky(retries=2, delay=120) @@ -311,11 +277,13 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs): +def test_remote_function_direct_session_param( + session_with_bq_connection_and_permanent_dataset, scalars_dfs +): @rf.remote_function( [int], int, - session=session_with_bq_connection, + session=session_with_bq_connection_and_permanent_dataset, ) def square(x): return x * x @@ -345,7 +313,9 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_default(session_with_bq_connection, scalars_dfs): +def test_remote_function_via_session_default( + session_with_bq_connection_and_permanent_dataset, scalars_dfs +): # Session has bigquery connection initialized via context. Without an # explicit dataset the default dataset from the session would be used. # Without an explicit bigquery connection, the one present in Session set @@ -353,7 +323,7 @@ def test_remote_function_via_session_default(session_with_bq_connection, scalars # the default behavior of reuse=True will take effect. Please note that the # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. 
- @session_with_bq_connection.remote_function([int], int) + @session_with_bq_connection_and_permanent_dataset.remote_function([int], int) def square(x): return x * x @@ -421,87 +391,15 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_context_connection_setter( - scalars_dfs, dataset_id, bq_cf_connection +def test_dataframe_applymap( + session_with_bq_connection_and_permanent_dataset, scalars_dfs ): - # Creating a session scoped only to this test as we would be setting a - # property in it - context = bigframes.BigQueryOptions() - context.bq_connection = bq_cf_connection - session = bigframes.connect(context) - - # Without an explicit bigquery connection, the one present in Session, - # set via context setter would be used. Without an explicit `reuse` the - # default behavior of reuse=True will take effect. Please note that the - # udf is same as the one used in other tests in this file so the underlying - # cloud function would be common with reuse=True. Since we are using a - # unique dataset_id, even though the cloud function would be reused, the bq - # remote function would still be created, making use of the bq connection - # set in the BigQueryOptions above. - @session.remote_function([int], int, dataset=dataset_id) - def square(x): - return x * x - - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] - bf_int64_col_filter = bf_int64_col.notnull() - bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] - bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() - ) - - pd_int64_col = scalars_pandas_df["int64_col"] - pd_int64_col_filter = pd_int64_col.notnull() - pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] - pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) - # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. - # pd_int64_col_filtered.dtype is Int64Dtype() - # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. - # For this test let's force the pandas dtype to be same as bigframes' dtype. - pd_result_col = pd_result_col.astype(pd.Int64Dtype()) - pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_default_connection(scalars_dfs, dataset_id): - @bpd.remote_function([int], int, dataset=dataset_id) - def square(x): - return x * x - - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] - bf_int64_col_filter = bf_int64_col.notnull() - bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] - bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() - ) - - pd_int64_col = scalars_pandas_df["int64_col"] - pd_int64_col_filter = pd_int64_col.notnull() - pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] - pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) - # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. - # pd_int64_col_filtered.dtype is Int64Dtype() - # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. - # For this test let's force the pandas dtype to be same as bigframes' dtype. 
- pd_result_col = pd_result_col.astype(pd.Int64Dtype()) - pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap(session_with_bq_connection, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( + [int], int + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -524,11 +422,15 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap_na_ignore(session_with_bq_connection, scalars_dfs): +def test_dataframe_applymap_na_ignore( + session_with_bq_connection_and_permanent_dataset, scalars_dfs +): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( + [int], int + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -549,11 +451,13 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_series_map(session_with_bq_connection, scalars_dfs): +def test_series_map(session_with_bq_connection_and_permanent_dataset, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( + [int], int + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -635,7 +539,7 @@ def square1(x): @pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_reads_udfs(bigquery_client, scalars_dfs, dataset_id): +def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( name="x", diff --git a/tests/unit/test_remote_function.py b/tests/unit/test_remote_function.py new file mode 100644 index 0000000000..540f4020d3 --- /dev/null +++ b/tests/unit/test_remote_function.py @@ -0,0 +1,28 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ibis.backends.bigquery import datatypes as bq_types +from ibis.expr import datatypes as ibis_types + +from bigframes import remote_function as rf + + +def test_supported_types_correspond(): + # The same types should be representable by the supported Python and BigQuery types. 
+ ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES} + ibis_types_from_bigquery = { + bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS + } + + assert ibis_types_from_python == ibis_types_from_bigquery From 53bb2cd227fcdfff438386d24e4f06d1973798c1 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 27 Oct 2023 19:22:28 -0700 Subject: [PATCH 16/32] feat: add dataframe melt (#116) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/blocks.py | 41 +++++++++++++++++- bigframes/dataframe.py | 38 ++++++++++++++++ tests/system/small/test_dataframe.py | 43 +++++++++++++++++++ tests/system/small/test_multiindex.py | 28 ++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 28 ++++++++++++ 5 files changed, 176 insertions(+), 2 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cc13edeaf9..635e7db865 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1356,13 +1356,50 @@ def stack(self, how="left", levels: int = 1): index_columns = [*added_index_columns, *self.index_columns] index_labels = [*new_index_level_names, *self._index_labels] - block = Block( + return Block( unpivot_expr, index_columns=index_columns, column_labels=result_index, index_labels=index_labels, ) - return block + + def melt( + self, + id_vars=typing.Sequence[str], + value_vars=typing.Sequence[str], + var_names=typing.Sequence[typing.Hashable], + value_name: typing.Hashable = "value", + ): + # TODO: Implement col_level and ignore_index + unpivot_col_id = guid.generate_guid() + var_col_ids = tuple([guid.generate_guid() for _ in var_names]) + # single unpivot col + unpivot_col = (unpivot_col_id, tuple(value_vars)) + value_labels = [self.col_id_to_label[col_id] for col_id in value_vars] + id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] + + dtype = self._expr.get_column_type(value_vars[0]) + + unpivot_expr = self._expr.unpivot( + row_labels=value_labels, + passthrough_columns=id_vars, + unpivot_columns=(unpivot_col,), + index_col_ids=var_col_ids, + dtype=dtype, + how="right", + ) + index_id = guid.generate_guid() + unpivot_expr = unpivot_expr.promote_offsets(index_id) + # Need to reorder to get id_vars before var_col and unpivot_col + unpivot_expr = unpivot_expr.select_columns( + [index_id, *id_vars, *var_col_ids, unpivot_col_id] + ) + + return Block( + unpivot_expr, + column_labels=[*id_labels, *var_names, value_name], + index_columns=[index_id], + ) def _create_stack_column( self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b19748e93d..b4fa8f5b18 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1675,6 +1675,44 @@ def idxmin(self) -> bigframes.series.Series: def idxmax(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmax(self._block)) + def melt( + self, + id_vars: 
typing.Optional[typing.Iterable[typing.Hashable]] = None, + value_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None, + var_name: typing.Union[ + typing.Hashable, typing.Sequence[typing.Hashable] + ] = None, + value_name: typing.Hashable = "value", + ): + if var_name is None: + # Determine default var_name. Attempt to use column labels if they are unique + if self.columns.nlevels > 1: + if len(set(self.columns.names)) == len(self.columns.names): + var_name = self.columns.names + else: + var_name = [f"variable_{i}" for i in range(len(self.columns.names))] + else: + var_name = self.columns.name or "variable" + + var_name = tuple(var_name) if utils.is_list_like(var_name) else (var_name,) + + if id_vars is not None: + id_col_ids = [self._resolve_label_exact(col) for col in id_vars] + else: + id_col_ids = [] + if value_vars is not None: + val_col_ids = [self._resolve_label_exact(col) for col in value_vars] + else: + val_col_ids = [ + col_id + for col_id in self._block.value_columns + if col_id not in id_col_ids + ] + + return DataFrame( + self._block.melt(id_col_ids, val_col_ids, var_name, value_name) + ) + def describe(self) -> DataFrame: df_numeric = self._drop_non_numeric(keep_bool=False) if len(df_numeric.columns) == 0: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e459e3bee3..b503f9a31d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1919,6 +1919,49 @@ def test_df_stack(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) +def test_df_melt_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].melt().to_pandas() + pd_result = scalars_pandas_df[columns].melt() + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +def test_df_melt_parameterized(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + + bf_result = scalars_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ).to_pandas() + pd_result = scalars_pandas_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ) + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + def test_df_unstack(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs # To match bigquery dataframes diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index a87dacae04..d6bf46f77c 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -752,6 +752,34 @@ def test_column_multi_index_stack(level): ) +def test_column_multi_index_melt(): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas <2.1 uses different stack 
implementation") + + level1 = pandas.Index(["b", "a", "b"]) + level2 = pandas.Index(["a", "b", "b"]) + level3 = pandas.Index(["b", "b", "a"]) + + multi_columns = pandas.MultiIndex.from_arrays( + [level1, level2, level3], names=["l1", "l2", "l3"] + ) + pd_df = pandas.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=[5, 2, None], + columns=multi_columns, + dtype="Int64", + ) + bf_df = bpd.DataFrame(pd_df) + + bf_result = bf_df.melt().to_pandas() + pd_result = pd_df.melt() + + # BigFrames uses different string and int types, but values are identical + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 13a81b4645..67836a8fd2 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2010,6 +2010,34 @@ def idxmax(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def melt(self, id_vars, value_vars, var_name, value_name): + """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + + Parameters + ---------- + id_vars (tuple, list, or ndarray, optional): + Column(s) to use as identifier variables. + value_vars (tuple, list, or ndarray, optional): + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name (scalar): + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name (scalar, default 'value'): + Name to use for the 'value' column. + + Returns: + DataFrame: Unpivoted DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): """ Count number of distinct elements in specified axis. From 2bf4bcc6de15fe686c56c506d3fa91680b64aeed Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:07:58 -0700 Subject: [PATCH 17/32] docs: add artithmetic df sample code (#153) * docs: add artithmetic df sample code * fix: address comments --- bigframes/session/__init__.py | 4 +- .../bigframes_vendored/pandas/core/frame.py | 494 +++++++++++++++++- 2 files changed, 492 insertions(+), 6 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index edac94ef3b..a60436d3c6 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -354,7 +354,7 @@ def read_gbq_query( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - Simple query input: + Simple query input: >>> df = bpd.read_gbq_query(''' ... SELECT @@ -370,7 +370,7 @@ def read_gbq_query( [2 rows x 3 columns] - Preserve ordering in a query input. + Preserve ordering in a query input. >>> df = bpd.read_gbq_query(''' ... 
SELECT diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 67836a8fd2..013d170114 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -697,6 +697,7 @@ def align( Join method is specified for each axis Index. + Args: other (DataFrame or Series): join ({{'outer', 'inner', 'left', 'right'}}, default 'outer'): @@ -978,9 +979,9 @@ def sort_values( Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. - kind (str, default `quicksort`): - Choice of sorting algorithm. Accepts 'quicksort’, ‘mergesort’, - ‘heapsort’, ‘stable’. Ignored except when determining whether to + kind (str, default 'quicksort'): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first', 'last'}, default `last`): ``{'first', 'last'}``, default 'last' Puts NaNs at the beginning @@ -1014,6 +1015,29 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].eq(360) + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``==``: + >>> df["degrees"] == 360 + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1036,6 +1060,30 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].ne(360) + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``!=``: + + >>> df["degrees"] != 360 + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1061,6 +1109,30 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... 
index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].le(180) + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``<=``: + + >>> df["degrees"] <= 180 + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1087,6 +1159,30 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].lt(180) + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``<``: + + >>> df["degrees"] < 180 + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1113,6 +1209,30 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].ge(360) + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``>=``: + + >>> df["degrees"] >= 360 + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1139,6 +1259,28 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].gt(360) + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``>``: + + >>> df["degrees"] > 360 + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1162,6 +1304,32 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].add(df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + + You can also use arithmetic operator ``+``: + + >>> df['A'] + (df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. 
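These methods also accept a scalar ``other``; a short sketch in the same style as the examples above (output shown as expected for the Int64 dtype):

>>> df['A'].add(1)
0    2
1    3
2    4
dtype: Int64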
@@ -1185,6 +1353,32 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].sub(df['B']) + 0 -3 + 1 -3 + 2 -3 + dtype: Int64 + + You can also use arithmetic operator ``-``: + + >>> df['A'] - (df['B']) + 0 -3 + 1 -3 + 2 -3 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1208,6 +1402,29 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rsub(df['B']) + 0 3 + 1 3 + 2 3 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``-``: + + >>> df['B'] - (df['A']) + 0 3 + 1 3 + 2 3 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1231,6 +1448,32 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].mul(df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + + You can also use arithmetic operator ``*``: + + >>> df['A'] * (df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1254,6 +1497,32 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].truediv(df['B']) + 0 0.25 + 1 0.4 + 2 0.5 + dtype: Float64 + + You can also use arithmetic operator ``/``: + + >>> df['A'] / (df['B']) + 0 0.25 + 1 0.4 + 2 0.5 + dtype: Float64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1277,6 +1546,29 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rtruediv(df['B']) + 0 4.0 + 1 2.5 + 2 2.0 + dtype: Float64 + + It's equivalent to using arithmetic operator: ``/``: + + >>> df['B'] / (df['A']) + 0 4.0 + 1 2.5 + 2 2.0 + dtype: Float64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1300,6 +1592,32 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... 
}) + + You can use method name: + + >>> df['A'].floordiv(df['B']) + 0 0 + 1 0 + 2 0 + dtype: Int64 + + You can also use arithmetic operator ``//``: + + >>> df['A'] // (df['B']) + 0 0 + 1 0 + 2 0 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1323,6 +1641,29 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rfloordiv(df['B']) + 0 4 + 1 2 + 2 2 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``//``: + + >>> df['B'] // (df['A']) + 0 4 + 1 2 + 2 2 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1346,6 +1687,32 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].mod(df['B']) + 0 1 + 1 2 + 2 3 + dtype: Int64 + + You can also use arithmetic operator ``%``: + + >>> df['A'] % (df['B']) + 0 1 + 1 2 + 2 3 + dtype: Int64 + Args: other: Any single or multiple element data structure, or list-like object. @@ -1369,6 +1736,29 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rmod(df['B']) + 0 0 + 1 1 + 2 0 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``%``: + + >>> df['B'] % (df['A']) + 0 0 + 1 1 + 2 0 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1382,7 +1772,7 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def pow(self, other, axis: str | int = "columns") -> DataFrame: - """Get Exponential power of dataframe and other, element-wise (binary operator `pow`). + """Get Exponential power of dataframe and other, element-wise (binary operator `**`). Equivalent to ``dataframe ** other``, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, `rpow`. @@ -1393,6 +1783,32 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].pow(df['B']) + 0 1 + 1 32 + 2 729 + dtype: Int64 + + You can also use arithmetic operator ``**``: + + >>> df['A'] ** (df['B']) + 0 1 + 1 32 + 2 729 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1417,6 +1833,29 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rpow(df['B']) + 0 4 + 1 25 + 2 216 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``**``: + + >>> df['B'] ** (df['A']) + 0 4 + 1 25 + 2 216 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1438,6 +1877,21 @@ def combine( to element-wise combine columns. The row and column indexes of the resulting DataFrame will be the union of the two. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 + >>> df1.combine(df2, take_smaller) + A B + 0 0 3 + 1 0 3 + + [2 rows x 2 columns] + Args: other (DataFrame): The DataFrame to merge column-wise. @@ -1468,6 +1922,20 @@ def combine_first(self, other) -> DataFrame: second.loc[index, col] are not missing values, upon calling first.combine_first(second). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) + >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine_first(df2) + A B + 0 1.0 3.0 + 1 0.0 4.0 + + [2 rows x 2 columns] + Args: other (DataFrame): Provided DataFrame to use to fill null values. @@ -1485,6 +1953,24 @@ def update( Aligns on indices. There is no return value. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = bpd.DataFrame({'B': [4, 5, 6], + ... 'C': [7, 8, 9]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + [3 rows x 2 columns] + Args: other (DataFrame, or object coercible into a DataFrame): Should have at least one matching index/column label From 343414abbe3915012202452de82bf172445b56ba Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 30 Oct 2023 18:56:14 +0000 Subject: [PATCH 18/32] feat: Implement operator `@` for `DataFrame.dot` (#139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/297502513 🦕 --- bigframes/dataframe.py | 2 ++ tests/system/small/test_dataframe.py | 33 +++++++++++++++++++++++++++ tests/system/small/test_multiindex.py | 16 +++++++++++++ 3 files changed, 51 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b4fa8f5b18..3804a4475e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2709,3 +2709,5 @@ def get_right_id(id): result = result[other.name].rename() return result + + __matmul__ = dot diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b503f9a31d..c96faa3526 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3264,6 +3264,23 @@ def test_df_dot( ) +def test_df_dot_operator( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_df_dot_series( matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df ): @@ -3278,3 +3295,19 @@ def test_df_dot_series( bf_result, pd_result, ) + + +def test_df_dot_operator_series( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df["x"]).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df["x"] + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. 
+ pd_result = pd_result.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index d6bf46f77c..bc35f633fd 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -998,6 +998,9 @@ def test_df_multi_index_dot_not_supported(): with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + # right multi-index right_index = pandas.MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "bb")]) bf1 = bpd.DataFrame(left_matrix) @@ -1005,6 +1008,9 @@ def test_df_multi_index_dot_not_supported(): with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + def test_column_multi_index_dot_not_supported(): left_matrix = [[1, 2, 3], [2, 5, 7]] @@ -1022,6 +1028,11 @@ def test_column_multi_index_dot_not_supported(): ): bf1.dot(bf2) + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 + # right multi-columns bf1 = bpd.DataFrame(left_matrix) bf2 = bpd.DataFrame(right_matrix, columns=multi_level_columns) @@ -1029,3 +1040,8 @@ def test_column_multi_index_dot_not_supported(): NotImplementedError, match="Multi-level column input is not supported" ): bf1.dot(bf2) + + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 From 4eac10d34e482eba7f531297cd82a8770778bfd1 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Mon, 30 Oct 2023 22:03:46 +0000 Subject: [PATCH 19/32] fix: fix typo and address comments --- bigframes/utils/log_adapter.py | 7 ++++++- setup.py | 2 +- testing/constraints-3.9.txt | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bigframes/utils/log_adapter.py b/bigframes/utils/log_adapter.py index d498b9a06f..8bdc6206fb 100644 --- a/bigframes/utils/log_adapter.py +++ b/bigframes/utils/log_adapter.py @@ -21,8 +21,13 @@ def method_logger(method): @functools.wraps(method) def wrapper(*args, **kwargs): api_method_name = str(method.__name__) - if not api_method_name.startswith("__"): + if not api_method_name.startswith("_"): add_api_method(api_method_name) + try: + result = method(*args, **kwargs) + return result + except Exception as e: + raise e return wrapper diff --git a/setup.py b/setup.py index b57a784a2c..a2fc69be17 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ "sqlalchemy >=1.4,<3.0dev", "ipywidgets >=7.7.1", "humanize >= 4.6.0", - "logruru >=0.6.0", + "loguru >=0.6.0", ] extras = { # Optional test dependencies packages. If they're missed, may skip some tests. 
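The method names collected by method_logger are eventually attached to BigQuery jobs as labels; a minimal sketch of that mapping, using the keyword arguments the unit tests use:

>>> import bigframes.session._io.bigquery as io_bq
>>> io_bq.create_job_configs_labels(
...     job_configs_labels=None, api_methods=["__init__", "head"]
... )
{'bigframes-api-0': '__init__', 'bigframes-api-1': 'head'}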
diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index f43d3b4ca0..47a7248640 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -96,6 +96,7 @@ pytest-xdist==3.2.1 python-dateutil==2.8.2 pytz==2023.3 PyYAML==6.0 +loguru==0.6.0 readme-renderer==37.3 requests==2.27.1 requests-oauthlib==1.3.1 From 39321e427d5bb3e89f315f771e08e6c89d53d601 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 3 Nov 2023 05:33:26 +0000 Subject: [PATCH 20/32] fix: address comments --- bigframes/session/_io/bigquery.py | 32 +- bigframes/utils/log_adapter.py | 3 +- tests/unit/session/test_io_bigquery.py | 44 ++- .../bigframes_vendored/cpython/Lib/LICENSE | 279 ++++++++++++++++++ .../cpython/Lib/functools.py | 101 +++++++ 5 files changed, 426 insertions(+), 33 deletions(-) create mode 100644 third_party/bigframes_vendored/cpython/Lib/LICENSE create mode 100644 third_party/bigframes_vendored/cpython/Lib/functools.py diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index e195b064b3..ea1aedef43 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -15,6 +15,7 @@ """Private module: Helpers for I/O operations.""" import datetime +import itertools import textwrap import types from typing import Dict, Iterable, Optional, Sequence, Union @@ -33,27 +34,16 @@ def create_job_configs_labels( ) -> Dict[str, str]: # If there is no label set if job_configs_labels is None: - labels = {} - label_values = list(api_methods) - else: - labels = job_configs_labels.copy() - cur_labels_len = len(job_configs_labels) - api_methods_len = len(api_methods) - # If the total number of labels is under the limit of labels count - if cur_labels_len + api_methods_len <= MAX_LABELS_COUNT: - label_values = list(api_methods) - # We capture the latest label if it is out of the length limit of labels count - else: - added_api_len = cur_labels_len + api_methods_len - MAX_LABELS_COUNT - label_values = list(api_methods)[-added_api_len:] - - for i, label_value in enumerate(label_values): - if job_configs_labels is not None: - label_key = "bigframes-api-" + str(i + len(job_configs_labels)) - else: - label_key = "bigframes-api-" + str(i) - labels[label_key] = label_value - return labels + job_configs_labels = {} + + labels = list( + itertools.chain( + job_configs_labels.keys(), + (f"bigframes-api-{i}" for i in range(len(api_methods))), + ) + ) + values = list(itertools.chain(job_configs_labels.values(), api_methods)) + return dict(zip(labels[:MAX_LABELS_COUNT], values[:MAX_LABELS_COUNT])) def create_export_csv_statement( diff --git a/bigframes/utils/log_adapter.py b/bigframes/utils/log_adapter.py index 8bdc6206fb..0491c6d1b5 100644 --- a/bigframes/utils/log_adapter.py +++ b/bigframes/utils/log_adapter.py @@ -21,7 +21,8 @@ def method_logger(method): @functools.wraps(method) def wrapper(*args, **kwargs): api_method_name = str(method.__name__) - if not api_method_name.startswith("_"): + # Track regular and "dunder" methods + if api_method_name.startswith("__") or not api_method_name.startswith("_"): add_api_method(api_method_name) try: result = method(*args, **kwargs) diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index ab59680553..89d355240b 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -19,6 +19,7 @@ import google.cloud.bigquery as bigquery import pytest +import bigframes import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq 
from bigframes.utils import log_adapter @@ -46,8 +47,8 @@ def test_create_job_configs_labels_length_limit_not_met(): expected_dict = { "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", - "bigframes-api-2": "df-agg", - "bigframes-api-3": "series-mode", + "bigframes-api-0": "df-agg", + "bigframes-api-1": "series-mode", } assert labels is not None assert len(labels) == 4 @@ -67,8 +68,8 @@ def test_create_job_configs_labels_log_adaptor_under_length_limit(): expected_dict = { "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", - "bigframes-api-2": "df-agg", - "bigframes-api-3": "series-mode", + "bigframes-api-0": "df-agg", + "bigframes-api-1": "series-mode", } assert labels is not None assert len(labels) == 4 @@ -93,24 +94,44 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): expected_dict = { "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", - "bigframes-api-2": "head", + "bigframes-api-0": "__init__", + "bigframes-api-1": "head", + "bigframes-api-2": "__init__", "bigframes-api-3": "max", + "bigframes-api-4": "__init__", } assert labels is not None - assert len(labels) == 4 + assert len(labels) == 7 assert labels == expected_dict +def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): + log_adapter._api_methods = [] + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running methods more than the labels' length limit + for i in range(66): + df.head() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=None, api_methods=api_methods + ) + assert labels is not None + assert len(labels) == 64 + assert "head" in labels.values() + assert "__init__" in labels.values() + + def test_create_job_configs_labels_length_limit_met(): cur_labels = { "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", } - for i in range(61): - key = f"bigframes-api-{i}" + for i in range(60): + key = f"bigframes-api-test-{i}" value = f"test{i}" cur_labels[key] = value - # If cur_labels length is 63, we can only add one label from api_methods + # If cur_labels length is 62, we can only add one label from api_methods log_adapter._api_methods = [] df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) # Test running two methods @@ -123,8 +144,9 @@ def test_create_job_configs_labels_length_limit_met(): ) assert labels is not None assert len(labels) == 64 - assert "head" not in labels.values() - assert "max" in labels.values() + assert "head" in labels.values() + assert "__init__" in labels.values() + assert "max" not in labels.values() assert "bigframes-api" in labels.keys() assert "source" in labels.keys() diff --git a/third_party/bigframes_vendored/cpython/Lib/LICENSE b/third_party/bigframes_vendored/cpython/Lib/LICENSE new file mode 100644 index 0000000000..f26bcf4d2d --- /dev/null +++ b/third_party/bigframes_vendored/cpython/Lib/LICENSE @@ -0,0 +1,279 @@ +A. HISTORY OF THE SOFTWARE +========================== + +Python was created in the early 1990s by Guido van Rossum at Stichting +Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands +as a successor of a language called ABC. Guido remains Python's +principal author, although it includes many contributions from others. + +In 1995, Guido continued his work on Python at the Corporation for +National Research Initiatives (CNRI, see https://www.cnri.reston.va.us) +in Reston, Virginia where he released several versions of the +software. 
+ +In May 2000, Guido and the Python core development team moved to +BeOpen.com to form the BeOpen PythonLabs team. In October of the same +year, the PythonLabs team moved to Digital Creations, which became +Zope Corporation. In 2001, the Python Software Foundation (PSF, see +https://www.python.org/psf/) was formed, a non-profit organization +created specifically to own Python-related Intellectual Property. +Zope Corporation was a sponsoring member of the PSF. + +All Python releases are Open Source (see https://opensource.org for +the Open Source Definition). Historically, most, but not all, Python +releases have also been GPL-compatible; the table below summarizes +the various releases. + + Release Derived Year Owner GPL- + from compatible? (1) + + 0.9.0 thru 1.2 1991-1995 CWI yes + 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes + 1.6 1.5.2 2000 CNRI no + 2.0 1.6 2000 BeOpen.com no + 1.6.1 1.6 2001 CNRI yes (2) + 2.1 2.0+1.6.1 2001 PSF no + 2.0.1 2.0+1.6.1 2001 PSF yes + 2.1.1 2.1+2.0.1 2001 PSF yes + 2.1.2 2.1.1 2002 PSF yes + 2.1.3 2.1.2 2002 PSF yes + 2.2 and above 2.1.1 2001-now PSF yes + +Footnotes: + +(1) GPL-compatible doesn't mean that we're distributing Python under + the GPL. All Python licenses, unlike the GPL, let you distribute + a modified version without making your changes open source. The + GPL-compatible licenses make it possible to combine Python with + other software that is released under the GPL; the others don't. + +(2) According to Richard Stallman, 1.6.1 is not GPL-compatible, + because its license has a choice of law clause. According to + CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 + is "not incompatible" with the GPL. + +Thanks to the many outside volunteers who have worked under Guido's +direction to make these releases possible. + + +B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON +=============================================================== + +Python software and documentation are licensed under the +Python Software Foundation License Version 2. + +Starting with Python 3.8.6, examples, recipes, and other code in +the documentation are dual licensed under the PSF License Version 2 +and the Zero-Clause BSD license. + +Some software incorporated into Python is under different licenses. +The licenses are listed with code falling under that license. + + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF hereby +grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +analyze, test, perform and/or display publicly, prepare derivative works, +distribute, and otherwise use Python alone or in any derivative version, +provided, however, that PSF's License Agreement and PSF's notice of copyright, +i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation; +All Rights Reserved" are retained in Python alone or in any derivative version +prepared by Licensee. + +3. 
In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 +------------------------------------------- + +BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 + +1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an +office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the +Individual or Organization ("Licensee") accessing and otherwise using +this software in source or binary form and its associated +documentation ("the Software"). + +2. Subject to the terms and conditions of this BeOpen Python License +Agreement, BeOpen hereby grants Licensee a non-exclusive, +royalty-free, world-wide license to reproduce, analyze, test, perform +and/or display publicly, prepare derivative works, distribute, and +otherwise use the Software alone or in any derivative version, +provided, however, that the BeOpen Python License is retained in the +Software, alone or in any derivative version prepared by Licensee. + +3. BeOpen is making the Software available to Licensee on an "AS IS" +basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE +SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS +AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY +DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +5. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +6. This License Agreement shall be governed by and interpreted in all +respects by the law of the State of California, excluding conflict of +law provisions. Nothing in this License Agreement shall be deemed to +create any relationship of agency, partnership, or joint venture +between BeOpen and Licensee. 
This License Agreement does not grant +permission to use BeOpen trademarks or trade names in a trademark +sense to endorse or promote products or services of Licensee, or any +third party. As an exception, the "BeOpen Python" logos available at +http://www.pythonlabs.com/logos.html may be used according to the +permissions granted on that web page. + +7. By copying, installing or otherwise using the software, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 +--------------------------------------- + +1. This LICENSE AGREEMENT is between the Corporation for National +Research Initiatives, having an office at 1895 Preston White Drive, +Reston, VA 20191 ("CNRI"), and the Individual or Organization +("Licensee") accessing and otherwise using Python 1.6.1 software in +source or binary form and its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, CNRI +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python 1.6.1 +alone or in any derivative version, provided, however, that CNRI's +License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) +1995-2001 Corporation for National Research Initiatives; All Rights +Reserved" are retained in Python 1.6.1 alone or in any derivative +version prepared by Licensee. Alternately, in lieu of CNRI's License +Agreement, Licensee may substitute the following text (omitting the +quotes): "Python 1.6.1 is made available subject to the terms and +conditions in CNRI's License Agreement. This Agreement together with +Python 1.6.1 may be located on the internet using the following +unique, persistent identifier (known as a handle): 1895.22/1013. This +Agreement may also be obtained from a proxy server on the internet +using the following URL: http://hdl.handle.net/1895.22/1013". + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python 1.6.1 or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python 1.6.1. + +4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" +basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. This License Agreement shall be governed by the federal +intellectual property law of the United States, including without +limitation the federal copyright law, and, to the extent such +U.S. federal law does not apply, by the law of the Commonwealth of +Virginia, excluding Virginia's conflict of law provisions. 
+Notwithstanding the foregoing, with regard to derivative works based +on Python 1.6.1 that incorporate non-separable material that was +previously distributed under the GNU General Public License (GPL), the +law of the Commonwealth of Virginia shall govern this License +Agreement only as to issues arising under or with respect to +Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this +License Agreement shall be deemed to create any relationship of +agency, partnership, or joint venture between CNRI and Licensee. This +License Agreement does not grant permission to use CNRI trademarks or +trade name in a trademark sense to endorse or promote products or +services of Licensee, or any third party. + +8. By clicking on the "ACCEPT" button where indicated, or by copying, +installing or otherwise using Python 1.6.1, Licensee agrees to be +bound by the terms and conditions of this License Agreement. + + ACCEPT + + +CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 +-------------------------------------------------- + +Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, +The Netherlands. All rights reserved. + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of Stichting Mathematisch +Centrum or CWI not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE +FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION +---------------------------------------------------------------------- + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. diff --git a/third_party/bigframes_vendored/cpython/Lib/functools.py b/third_party/bigframes_vendored/cpython/Lib/functools.py new file mode 100644 index 0000000000..31e8dbed14 --- /dev/null +++ b/third_party/bigframes_vendored/cpython/Lib/functools.py @@ -0,0 +1,101 @@ +"""functools.py - Tools for working with functions and callable objects +""" + +# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +# 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018 Python Software Foundation; +# All Rights Reserved + +# Python module wrapper for _functools C module +# to allow utilities written in Python to be added +# to the functools module. 
+# Written by Nick Coghlan , +# Raymond Hettinger , +# and Łukasz Langa . +# Copyright (C) 2006-2013 Python Software Foundation. +# See C source code for _functools credits/copyright + +# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +# -------------------------------------------- + +# 1. This LICENSE AGREEMENT is between the Python Software Foundation ("PSF"), +# and the Individual or Organization ("Licensee") accessing and otherwise +# using this software ("Python") in source or binary form and its associated +# documentation. + +# 2. Subject to the terms and conditions of this License Agreement, PSF hereby +# grants Licensee a nonexclusive, royalty-free, world-wide license to +# reproduce, analyze, test, perform and/or display publicly, prepare +# derivative works, distribute, and otherwise use Python alone or in any +# derivative version, provided, however, that PSF's License Agreement and +# PSF's notice of copyright, i.e., "Copyright (c) 2001, 2002, 2003, 2004, +# 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, +# 2017, 2018 Python Software Foundation; All Rights Reserved" are retained in +# Python alone or in any derivative version prepared by Licensee. + +# 3. In the event Licensee prepares a derivative work that is based on or +# incorporates Python or any part thereof, and wants to make the derivative +# work available to others as provided herein, then Licensee hereby agrees to +# include in any such work a brief summary of the changes made to Python. + +# 4. PSF is making Python available to Licensee on an "AS IS" basis. PSF MAKES +# NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT +# NOT LIMITATION, PSF MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF +# MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF +# PYTHON WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. + +# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON FOR ANY +# INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF +# MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, OR ANY DERIVATIVE +# THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +# 6. This License Agreement will automatically terminate upon a material +# breach of its terms and conditions. + +# 7. Nothing in this License Agreement shall be deemed to create any +# relationship of agency, partnership, or joint venture between PSF and +# Licensee. This License Agreement does not grant permission to use PSF +# trademarks or trade name in a trademark sense to endorse or promote products +# or services of Licensee, or any third party. + +# 8. By copying, installing or otherwise using Python, Licensee agrees to be +# bound by the terms and conditions of this License Agreement. 
+from bigframes import constants + +WRAPPER_ASSIGNMENTS = ( + "__module__", + "__name__", + "__qualname__", + "__doc__", + "__annotations__", + "__type_params__", +) +WRAPPER_UPDATES = ("__dict__",) + + +def update_wrapper( + wrapper, wrapped, assigned=WRAPPER_ASSIGNMENTS, updated=WRAPPER_UPDATES +): + """Update a wrapper function to look like the wrapped function + + wrapper is the function to be updated + wrapped is the original function + assigned is a tuple naming the attributes assigned directly + from the wrapped function to the wrapper function (defaults to + functools.WRAPPER_ASSIGNMENTS) + updated is a tuple naming the attributes of the wrapper that + are updated with the corresponding attribute from the wrapped + function (defaults to functools.WRAPPER_UPDATES) + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +def wraps(wrapped, assigned=WRAPPER_ASSIGNMENTS, updated=WRAPPER_UPDATES): + """Decorator factory to apply update_wrapper() to a wrapper function + + Returns a decorator that invokes update_wrapper() with the decorated + function as the wrapper argument and the arguments to wraps() as the + remaining arguments. Default arguments are as for update_wrapper(). + This is a convenience function to simplify applying partial() to + update_wrapper(). + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From aebcf117d6b3fa2657a2eccb6ebacd1cb2674561 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 3 Nov 2023 17:41:28 +0000 Subject: [PATCH 21/32] Remove utils folder and refactor it in core directory --- bigframes/core/__init__.py | 3 +++ bigframes/{utils => core}/log_adapter.py | 0 bigframes/operations/datetimes.py | 2 +- bigframes/operations/strings.py | 2 +- bigframes/operations/structs.py | 2 +- bigframes/session/__init__.py | 2 +- bigframes/utils/__init__.py | 17 ----------------- tests/unit/session/test_io_bigquery.py | 2 +- 8 files changed, 8 insertions(+), 22 deletions(-) rename bigframes/{utils => core}/log_adapter.py (100%) delete mode 100644 bigframes/utils/__init__.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 4653f0ab6a..1f7786cb93 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -25,6 +25,7 @@ import bigframes.core.compile as compiled import bigframes.core.guid +from bigframes.core.log_adapter import class_logger, method_logger import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference import bigframes.core.ordering as orderings @@ -39,6 +40,8 @@ ORDER_ID_COLUMN = "bigframes_ordering_id" PREDICATE_COLUMN = "bigframes_predicate" +__all__ = ["class_logger", "method_logger"] + @dataclass(frozen=True) class ArrayValue: diff --git a/bigframes/utils/log_adapter.py b/bigframes/core/log_adapter.py similarity index 100% rename from bigframes/utils/log_adapter.py rename to bigframes/core/log_adapter.py diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index 7275b7ee34..a8a33beb57 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -14,10 +14,10 @@ from __future__ import annotations +from bigframes.core import log_adapter import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.indexes.accessor as vendordt diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 6102a63bc5..201b19abe8 100644 --- 
a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -18,11 +18,11 @@ from typing import cast, Literal, Optional, Union import bigframes.constants as constants +from bigframes.core import log_adapter import bigframes.dataframe as df import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.strings.accessor as vendorstr # Maps from python to re2 diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index bdf759371e..b2ae98f378 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -18,11 +18,11 @@ import ibis.expr.types as ibis_types +from bigframes.core import log_adapter import bigframes.dataframe import bigframes.operations import bigframes.operations.base import bigframes.series -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0253aae6fc..e73562eacd 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -64,6 +64,7 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes.constants as constants +from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.guid as guid @@ -76,7 +77,6 @@ from bigframes.remote_function import remote_function as bigframes_rf import bigframes.session._io.bigquery as bigframes_io import bigframes.session.clients -from bigframes.utils import log_adapter import bigframes.version # Even though the ibis.backends.bigquery.registry import is unused, it's needed diff --git a/bigframes/utils/__init__.py b/bigframes/utils/__init__.py deleted file mode 100644 index 82f1eeda55..0000000000 --- a/bigframes/utils/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from bigframes.utils.log_adapter import class_logger, method_logger - -__all__ = ["class_logger", "method_logger"] diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 89d355240b..8114b32e02 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -20,9 +20,9 @@ import pytest import bigframes +from bigframes.core import log_adapter import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq -from bigframes.utils import log_adapter def test_create_job_configs_labels_is_none(): From ec526b50e52433e85686c6402b02fbfbfc44f990 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 3 Nov 2023 17:41:28 +0000 Subject: [PATCH 22/32] Remove utils folder and refactor it in core directory --- bigframes/core/__init__.py | 3 +++ bigframes/core/groupby/__init__.py | 2 +- bigframes/{utils => core}/log_adapter.py | 0 bigframes/core/window/__init__.py | 2 +- bigframes/dataframe.py | 2 +- bigframes/operations/datetimes.py | 2 +- bigframes/operations/strings.py | 2 +- bigframes/operations/structs.py | 2 +- bigframes/series.py | 2 +- bigframes/session/__init__.py | 2 +- bigframes/utils/__init__.py | 17 ----------------- tests/unit/session/test_io_bigquery.py | 2 +- 12 files changed, 12 insertions(+), 26 deletions(-) rename bigframes/{utils => core}/log_adapter.py (100%) delete mode 100644 bigframes/utils/__init__.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 4653f0ab6a..1f7786cb93 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -25,6 +25,7 @@ import bigframes.core.compile as compiled import bigframes.core.guid +from bigframes.core.log_adapter import class_logger, method_logger import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference import bigframes.core.ordering as orderings @@ -39,6 +40,8 @@ ORDER_ID_COLUMN = "bigframes_ordering_id" PREDICATE_COLUMN = "bigframes_predicate" +__all__ = ["class_logger", "method_logger"] + @dataclass(frozen=True) class ArrayValue: diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 4e046a426f..0bf3768895 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -19,6 +19,7 @@ import pandas as pd import bigframes.constants as constants +from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks @@ -30,7 +31,6 @@ import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.series as series -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby diff --git a/bigframes/utils/log_adapter.py b/bigframes/core/log_adapter.py similarity index 100% rename from bigframes/utils/log_adapter.py rename to bigframes/core/log_adapter.py diff --git a/bigframes/core/window/__init__.py b/bigframes/core/window/__init__.py index d84af70bfa..240715b6df 100644 --- a/bigframes/core/window/__init__.py +++ b/bigframes/core/window/__init__.py @@ -16,10 +16,10 @@ import typing +from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.operations.aggregations as agg_ops -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.window.rolling as vendored_pandas_rolling diff --git a/bigframes/dataframe.py 
b/bigframes/dataframe.py index 07af10a506..6e2c59143a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -40,6 +40,7 @@ import bigframes._config.display_options as display_options import bigframes.constants as constants import bigframes.core +from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.groupby as groupby @@ -56,7 +57,6 @@ import bigframes.series import bigframes.series as bf_series import bigframes.session._io.bigquery -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.frame as vendored_pandas_frame import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index 7275b7ee34..a8a33beb57 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -14,10 +14,10 @@ from __future__ import annotations +from bigframes.core import log_adapter import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.indexes.accessor as vendordt diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 6102a63bc5..201b19abe8 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -18,11 +18,11 @@ from typing import cast, Literal, Optional, Union import bigframes.constants as constants +from bigframes.core import log_adapter import bigframes.dataframe as df import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.strings.accessor as vendorstr # Maps from python to re2 diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index bdf759371e..b2ae98f378 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -18,11 +18,11 @@ import ibis.expr.types as ibis_types +from bigframes.core import log_adapter import bigframes.dataframe import bigframes.operations import bigframes.operations.base import bigframes.series -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors diff --git a/bigframes/series.py b/bigframes/series.py index cb293a19f9..5c07d1eda8 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -29,6 +29,7 @@ import bigframes.constants as constants import bigframes.core +from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.groupby as groupby @@ -52,7 +53,6 @@ import bigframes.operations.datetimes as dt import bigframes.operations.strings as strings import bigframes.operations.structs as structs -from bigframes.utils import log_adapter import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series LevelType = typing.Union[str, int] diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0253aae6fc..e73562eacd 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -64,6 +64,7 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes.constants as constants +from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.blocks as blocks 
import bigframes.core.guid as guid @@ -76,7 +77,6 @@ from bigframes.remote_function import remote_function as bigframes_rf import bigframes.session._io.bigquery as bigframes_io import bigframes.session.clients -from bigframes.utils import log_adapter import bigframes.version # Even though the ibis.backends.bigquery.registry import is unused, it's needed diff --git a/bigframes/utils/__init__.py b/bigframes/utils/__init__.py deleted file mode 100644 index 82f1eeda55..0000000000 --- a/bigframes/utils/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from bigframes.utils.log_adapter import class_logger, method_logger - -__all__ = ["class_logger", "method_logger"] diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 89d355240b..8114b32e02 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -20,9 +20,9 @@ import pytest import bigframes +from bigframes.core import log_adapter import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq -from bigframes.utils import log_adapter def test_create_job_configs_labels_is_none(): From 3a94c234539406b8c1678f12aff8dc901267d19c Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Fri, 3 Nov 2023 18:27:12 +0000 Subject: [PATCH 23/32] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/unit/session/test_io_bigquery.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 7ac67d556c..301f25f829 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -19,7 +19,6 @@ import google.cloud.bigquery as bigquery import pytest - import bigframes from bigframes.core import log_adapter import bigframes.pandas as bpd @@ -171,7 +170,6 @@ def test_create_table_clone_doesnt_clone_anonymous_datasets(): session._start_query.assert_not_called() - def test_create_table_clone_sets_expiration(): session = mock.create_autospec(bigframes.session.Session) source = bigquery.TableReference.from_string( From d84c56912bc7a019f769d9f95ec081c2fad8480a Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 3 Nov 2023 18:35:26 +0000 Subject: [PATCH 24/32] fix merge conflicts --- bigframes/session/_io/bigquery.py | 1 + tests/unit/session/test_io_bigquery.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index be8b307422..f17ddfe727 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -20,6 +20,7 @@ import itertools import textwrap import types +import typing from typing import Dict, Iterable, Optional, Sequence, Union import uuid diff --git 
a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 7ac67d556c..e946a272c1 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -19,10 +19,10 @@ import google.cloud.bigquery as bigquery import pytest - import bigframes from bigframes.core import log_adapter import bigframes.pandas as bpd +import bigframes.session import bigframes.session._io.bigquery as io_bq @@ -171,7 +171,6 @@ def test_create_table_clone_doesnt_clone_anonymous_datasets(): session._start_query.assert_not_called() - def test_create_table_clone_sets_expiration(): session = mock.create_autospec(bigframes.session.Session) source = bigquery.TableReference.from_string( From a87bcb85dd19f71b6e3a59a715d8e57c7e1cbf21 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 14 Nov 2023 03:51:43 +0000 Subject: [PATCH 25/32] redesign the log adapter --- bigframes/core/__init__.py | 3 - bigframes/core/log_adapter.py | 64 +++- bigframes/core/nodes.py | 1 + bigframes/session/__init__.py | 4 +- bigframes/session/_io/bigquery.py | 2 +- setup.py | 1 - testing/constraints-3.9.txt | 1 - .../bigframes_vendored/cpython/Lib/LICENSE | 279 ------------------ .../cpython/Lib/functools.py | 101 ------- 9 files changed, 52 insertions(+), 404 deletions(-) delete mode 100644 third_party/bigframes_vendored/cpython/Lib/LICENSE delete mode 100644 third_party/bigframes_vendored/cpython/Lib/functools.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 50593aee42..b640692bc8 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -26,7 +26,6 @@ import bigframes.core.compile.compiled as compiled import bigframes.core.compile.compiler as compiler import bigframes.core.guid -from bigframes.core.log_adapter import class_logger, method_logger import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference import bigframes.core.ordering as orderings @@ -42,8 +41,6 @@ ORDER_ID_COLUMN = "bigframes_ordering_id" PREDICATE_COLUMN = "bigframes_predicate" -__all__ = ["class_logger", "method_logger"] - @dataclass(frozen=True) class ArrayValue: diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 0491c6d1b5..727a8eb084 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -1,21 +1,43 @@ -import functools +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-from loguru import logger +import functools +import threading -_log_file_path = None -_logger = logger -_api_methods = [] +_lock = threading.Lock() +MAX_LABELS_COUNT = 64 -def class_logger(decorated_cls): +def class_logger(api_methods=None): """Decorator that adds logging functionality to each method of the class.""" - for attr_name, attr_value in decorated_cls.__dict__.items(): - if callable(attr_value): - setattr(decorated_cls, attr_name, method_logger(attr_value)) - return decorated_cls + def decorator(decorated_cls): + for attr_name, attr_value in decorated_cls.__dict__.items(): + if callable(attr_value): + setattr(decorated_cls, attr_name, method_logger(attr_value)) + + # Initialize or extend _api_methods attribute + decorated_cls._api_methods = getattr(decorated_cls, "_api_methods", []) + if api_methods: + decorated_cls._api_methods.extend(api_methods) + + return decorated_cls -def method_logger(method): + return decorator + + +def method_logger(method, cls): """Decorator that adds logging functionality to a method.""" @functools.wraps(method) @@ -23,7 +45,7 @@ def wrapper(*args, **kwargs): api_method_name = str(method.__name__) # Track regular and "dunder" methods if api_method_name.startswith("__") or not api_method_name.startswith("_"): - add_api_method(api_method_name) + add_api_method(api_method_name, cls) try: result = method(*args, **kwargs) return result @@ -33,6 +55,18 @@ def wrapper(*args, **kwargs): return wrapper -def add_api_method(method: str): - global _api_methods - _api_methods.append(method) +def add_api_method(api_method_name, cls): + global _lock + with _lock: + # Push the method to the front of the _api_methods list + cls._api_methods.insert(0, api_method_name) + # Keep the list length within the maximum limit (adjust MAX_LABELS_COUNT as needed) + cls._api_methods = cls._api_methods[:MAX_LABELS_COUNT] + + +def get_and_reset_api_methods(cls): + global _lock + with _lock: + previous_api_methods = list(cls._api_methods) + cls._api_methods.clear() + return previous_api_methods diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 8f1e2e5e73..04668ecb18 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import annotations from dataclasses import dataclass, field diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 44947406c4..62125296a5 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1365,14 +1365,12 @@ def _start_query( """ Starts query job and waits for results. 
""" - api_methods = log_adapter._api_methods + api_methods = log_adapter.get_and_reset_api_methods(self) job_config = self._prepare_job_config(job_config) job_config.labels = bigframes_io.create_job_configs_labels( job_configs_labels=job_config.labels, api_methods=api_methods ) query_job = self.bqclient.query(sql, job_config=job_config) - # Clear out the global api logger - log_adapter._api_methods = [] opts = bigframes.options.display if opts.progress_bar is not None and not query_job.configuration.dry_run: diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index 4029ce7aa4..b2a6ea3472 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -41,7 +41,7 @@ def create_job_configs_labels( labels = list( itertools.chain( job_configs_labels.keys(), - (f"bigframes-api-{i}" for i in range(len(api_methods))), + (f"recent-bigframes-api-{i}" for i in range(len(api_methods))), ) ) values = list(itertools.chain(job_configs_labels.values(), api_methods)) diff --git a/setup.py b/setup.py index a2fc69be17..29eacb74a9 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,6 @@ "sqlalchemy >=1.4,<3.0dev", "ipywidgets >=7.7.1", "humanize >= 4.6.0", - "loguru >=0.6.0", ] extras = { # Optional test dependencies packages. If they're missed, may skip some tests. diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 47a7248640..f43d3b4ca0 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -96,7 +96,6 @@ pytest-xdist==3.2.1 python-dateutil==2.8.2 pytz==2023.3 PyYAML==6.0 -loguru==0.6.0 readme-renderer==37.3 requests==2.27.1 requests-oauthlib==1.3.1 diff --git a/third_party/bigframes_vendored/cpython/Lib/LICENSE b/third_party/bigframes_vendored/cpython/Lib/LICENSE deleted file mode 100644 index f26bcf4d2d..0000000000 --- a/third_party/bigframes_vendored/cpython/Lib/LICENSE +++ /dev/null @@ -1,279 +0,0 @@ -A. HISTORY OF THE SOFTWARE -========================== - -Python was created in the early 1990s by Guido van Rossum at Stichting -Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands -as a successor of a language called ABC. Guido remains Python's -principal author, although it includes many contributions from others. - -In 1995, Guido continued his work on Python at the Corporation for -National Research Initiatives (CNRI, see https://www.cnri.reston.va.us) -in Reston, Virginia where he released several versions of the -software. - -In May 2000, Guido and the Python core development team moved to -BeOpen.com to form the BeOpen PythonLabs team. In October of the same -year, the PythonLabs team moved to Digital Creations, which became -Zope Corporation. In 2001, the Python Software Foundation (PSF, see -https://www.python.org/psf/) was formed, a non-profit organization -created specifically to own Python-related Intellectual Property. -Zope Corporation was a sponsoring member of the PSF. - -All Python releases are Open Source (see https://opensource.org for -the Open Source Definition). Historically, most, but not all, Python -releases have also been GPL-compatible; the table below summarizes -the various releases. - - Release Derived Year Owner GPL- - from compatible? 
(1) - - 0.9.0 thru 1.2 1991-1995 CWI yes - 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes - 1.6 1.5.2 2000 CNRI no - 2.0 1.6 2000 BeOpen.com no - 1.6.1 1.6 2001 CNRI yes (2) - 2.1 2.0+1.6.1 2001 PSF no - 2.0.1 2.0+1.6.1 2001 PSF yes - 2.1.1 2.1+2.0.1 2001 PSF yes - 2.1.2 2.1.1 2002 PSF yes - 2.1.3 2.1.2 2002 PSF yes - 2.2 and above 2.1.1 2001-now PSF yes - -Footnotes: - -(1) GPL-compatible doesn't mean that we're distributing Python under - the GPL. All Python licenses, unlike the GPL, let you distribute - a modified version without making your changes open source. The - GPL-compatible licenses make it possible to combine Python with - other software that is released under the GPL; the others don't. - -(2) According to Richard Stallman, 1.6.1 is not GPL-compatible, - because its license has a choice of law clause. According to - CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 - is "not incompatible" with the GPL. - -Thanks to the many outside volunteers who have worked under Guido's -direction to make these releases possible. - - -B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON -=============================================================== - -Python software and documentation are licensed under the -Python Software Foundation License Version 2. - -Starting with Python 3.8.6, examples, recipes, and other code in -the documentation are dual licensed under the PSF License Version 2 -and the Zero-Clause BSD license. - -Some software incorporated into Python is under different licenses. -The licenses are listed with code falling under that license. - - -PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 --------------------------------------------- - -1. This LICENSE AGREEMENT is between the Python Software Foundation -("PSF"), and the Individual or Organization ("Licensee") accessing and -otherwise using this software ("Python") in source or binary form and -its associated documentation. - -2. Subject to the terms and conditions of this License Agreement, PSF hereby -grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, -analyze, test, perform and/or display publicly, prepare derivative works, -distribute, and otherwise use Python alone or in any derivative version, -provided, however, that PSF's License Agreement and PSF's notice of copyright, -i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, -2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation; -All Rights Reserved" are retained in Python alone or in any derivative version -prepared by Licensee. - -3. In the event Licensee prepares a derivative work that is based on -or incorporates Python or any part thereof, and wants to make -the derivative work available to others as provided herein, then -Licensee hereby agrees to include in any such work a brief summary of -the changes made to Python. - -4. PSF is making Python available to Licensee on an "AS IS" -basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND -DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT -INFRINGE ANY THIRD PARTY RIGHTS. - -5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, -OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 
- -6. This License Agreement will automatically terminate upon a material -breach of its terms and conditions. - -7. Nothing in this License Agreement shall be deemed to create any -relationship of agency, partnership, or joint venture between PSF and -Licensee. This License Agreement does not grant permission to use PSF -trademarks or trade name in a trademark sense to endorse or promote -products or services of Licensee, or any third party. - -8. By copying, installing or otherwise using Python, Licensee -agrees to be bound by the terms and conditions of this License -Agreement. - - -BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 -------------------------------------------- - -BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 - -1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an -office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the -Individual or Organization ("Licensee") accessing and otherwise using -this software in source or binary form and its associated -documentation ("the Software"). - -2. Subject to the terms and conditions of this BeOpen Python License -Agreement, BeOpen hereby grants Licensee a non-exclusive, -royalty-free, world-wide license to reproduce, analyze, test, perform -and/or display publicly, prepare derivative works, distribute, and -otherwise use the Software alone or in any derivative version, -provided, however, that the BeOpen Python License is retained in the -Software, alone or in any derivative version prepared by Licensee. - -3. BeOpen is making the Software available to Licensee on an "AS IS" -basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND -DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT -INFRINGE ANY THIRD PARTY RIGHTS. - -4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE -SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS -AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY -DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. - -5. This License Agreement will automatically terminate upon a material -breach of its terms and conditions. - -6. This License Agreement shall be governed by and interpreted in all -respects by the law of the State of California, excluding conflict of -law provisions. Nothing in this License Agreement shall be deemed to -create any relationship of agency, partnership, or joint venture -between BeOpen and Licensee. This License Agreement does not grant -permission to use BeOpen trademarks or trade names in a trademark -sense to endorse or promote products or services of Licensee, or any -third party. As an exception, the "BeOpen Python" logos available at -http://www.pythonlabs.com/logos.html may be used according to the -permissions granted on that web page. - -7. By copying, installing or otherwise using the software, Licensee -agrees to be bound by the terms and conditions of this License -Agreement. - - -CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 ---------------------------------------- - -1. This LICENSE AGREEMENT is between the Corporation for National -Research Initiatives, having an office at 1895 Preston White Drive, -Reston, VA 20191 ("CNRI"), and the Individual or Organization -("Licensee") accessing and otherwise using Python 1.6.1 software in -source or binary form and its associated documentation. - -2. 
Subject to the terms and conditions of this License Agreement, CNRI -hereby grants Licensee a nonexclusive, royalty-free, world-wide -license to reproduce, analyze, test, perform and/or display publicly, -prepare derivative works, distribute, and otherwise use Python 1.6.1 -alone or in any derivative version, provided, however, that CNRI's -License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) -1995-2001 Corporation for National Research Initiatives; All Rights -Reserved" are retained in Python 1.6.1 alone or in any derivative -version prepared by Licensee. Alternately, in lieu of CNRI's License -Agreement, Licensee may substitute the following text (omitting the -quotes): "Python 1.6.1 is made available subject to the terms and -conditions in CNRI's License Agreement. This Agreement together with -Python 1.6.1 may be located on the internet using the following -unique, persistent identifier (known as a handle): 1895.22/1013. This -Agreement may also be obtained from a proxy server on the internet -using the following URL: http://hdl.handle.net/1895.22/1013". - -3. In the event Licensee prepares a derivative work that is based on -or incorporates Python 1.6.1 or any part thereof, and wants to make -the derivative work available to others as provided herein, then -Licensee hereby agrees to include in any such work a brief summary of -the changes made to Python 1.6.1. - -4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" -basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND -DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT -INFRINGE ANY THIRD PARTY RIGHTS. - -5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, -OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. - -6. This License Agreement will automatically terminate upon a material -breach of its terms and conditions. - -7. This License Agreement shall be governed by the federal -intellectual property law of the United States, including without -limitation the federal copyright law, and, to the extent such -U.S. federal law does not apply, by the law of the Commonwealth of -Virginia, excluding Virginia's conflict of law provisions. -Notwithstanding the foregoing, with regard to derivative works based -on Python 1.6.1 that incorporate non-separable material that was -previously distributed under the GNU General Public License (GPL), the -law of the Commonwealth of Virginia shall govern this License -Agreement only as to issues arising under or with respect to -Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this -License Agreement shall be deemed to create any relationship of -agency, partnership, or joint venture between CNRI and Licensee. This -License Agreement does not grant permission to use CNRI trademarks or -trade name in a trademark sense to endorse or promote products or -services of Licensee, or any third party. - -8. By clicking on the "ACCEPT" button where indicated, or by copying, -installing or otherwise using Python 1.6.1, Licensee agrees to be -bound by the terms and conditions of this License Agreement. 
- - ACCEPT - - -CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 --------------------------------------------------- - -Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, -The Netherlands. All rights reserved. - -Permission to use, copy, modify, and distribute this software and its -documentation for any purpose and without fee is hereby granted, -provided that the above copyright notice appear in all copies and that -both that copyright notice and this permission notice appear in -supporting documentation, and that the name of Stichting Mathematisch -Centrum or CWI not be used in advertising or publicity pertaining to -distribution of the software without specific, written prior -permission. - -STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO -THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE -FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT -OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION ----------------------------------------------------------------------- - -Permission to use, copy, modify, and/or distribute this software for any -purpose with or without fee is hereby granted. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH -REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY -AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, -INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR -OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -PERFORMANCE OF THIS SOFTWARE. diff --git a/third_party/bigframes_vendored/cpython/Lib/functools.py b/third_party/bigframes_vendored/cpython/Lib/functools.py deleted file mode 100644 index 31e8dbed14..0000000000 --- a/third_party/bigframes_vendored/cpython/Lib/functools.py +++ /dev/null @@ -1,101 +0,0 @@ -"""functools.py - Tools for working with functions and callable objects -""" - -# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, -# 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018 Python Software Foundation; -# All Rights Reserved - -# Python module wrapper for _functools C module -# to allow utilities written in Python to be added -# to the functools module. -# Written by Nick Coghlan , -# Raymond Hettinger , -# and Łukasz Langa . -# Copyright (C) 2006-2013 Python Software Foundation. -# See C source code for _functools credits/copyright - -# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 -# -------------------------------------------- - -# 1. This LICENSE AGREEMENT is between the Python Software Foundation ("PSF"), -# and the Individual or Organization ("Licensee") accessing and otherwise -# using this software ("Python") in source or binary form and its associated -# documentation. - -# 2. 
Subject to the terms and conditions of this License Agreement, PSF hereby -# grants Licensee a nonexclusive, royalty-free, world-wide license to -# reproduce, analyze, test, perform and/or display publicly, prepare -# derivative works, distribute, and otherwise use Python alone or in any -# derivative version, provided, however, that PSF's License Agreement and -# PSF's notice of copyright, i.e., "Copyright (c) 2001, 2002, 2003, 2004, -# 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, -# 2017, 2018 Python Software Foundation; All Rights Reserved" are retained in -# Python alone or in any derivative version prepared by Licensee. - -# 3. In the event Licensee prepares a derivative work that is based on or -# incorporates Python or any part thereof, and wants to make the derivative -# work available to others as provided herein, then Licensee hereby agrees to -# include in any such work a brief summary of the changes made to Python. - -# 4. PSF is making Python available to Licensee on an "AS IS" basis. PSF MAKES -# NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT -# NOT LIMITATION, PSF MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF -# MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF -# PYTHON WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. - -# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON FOR ANY -# INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF -# MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, OR ANY DERIVATIVE -# THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. - -# 6. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. - -# 7. Nothing in this License Agreement shall be deemed to create any -# relationship of agency, partnership, or joint venture between PSF and -# Licensee. This License Agreement does not grant permission to use PSF -# trademarks or trade name in a trademark sense to endorse or promote products -# or services of Licensee, or any third party. - -# 8. By copying, installing or otherwise using Python, Licensee agrees to be -# bound by the terms and conditions of this License Agreement. -from bigframes import constants - -WRAPPER_ASSIGNMENTS = ( - "__module__", - "__name__", - "__qualname__", - "__doc__", - "__annotations__", - "__type_params__", -) -WRAPPER_UPDATES = ("__dict__",) - - -def update_wrapper( - wrapper, wrapped, assigned=WRAPPER_ASSIGNMENTS, updated=WRAPPER_UPDATES -): - """Update a wrapper function to look like the wrapped function - - wrapper is the function to be updated - wrapped is the original function - assigned is a tuple naming the attributes assigned directly - from the wrapped function to the wrapper function (defaults to - functools.WRAPPER_ASSIGNMENTS) - updated is a tuple naming the attributes of the wrapper that - are updated with the corresponding attribute from the wrapped - function (defaults to functools.WRAPPER_UPDATES) - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - - -def wraps(wrapped, assigned=WRAPPER_ASSIGNMENTS, updated=WRAPPER_UPDATES): - """Decorator factory to apply update_wrapper() to a wrapper function - - Returns a decorator that invokes update_wrapper() with the decorated - function as the wrapper argument and the arguments to wraps() as the - remaining arguments. Default arguments are as for update_wrapper(). - This is a convenience function to simplify applying partial() to - update_wrapper(). 
- """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 53a99f96170b68bb39805189344138b19118fa07 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 14 Nov 2023 18:09:12 +0000 Subject: [PATCH 26/32] Make the global _api_methods and lock threads --- bigframes/core/log_adapter.py | 42 ++++++++------------ bigframes/session/__init__.py | 3 +- tests/unit/core/test_log_adapter.py | 25 ++++++------ tests/unit/session/test_io_bigquery.py | 55 +++++++++++++++++++++++++- 4 files changed, 81 insertions(+), 44 deletions(-) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 9d488d6bc8..ee8ff8020b 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -14,33 +14,22 @@ import functools import threading +from typing import List _lock = threading.Lock() MAX_LABELS_COUNT = 64 +_api_methods: List = [] -def class_logger(api_methods=None): +def class_logger(decorated_cls): """Decorator that adds logging functionality to each method of the class.""" + for attr_name, attr_value in decorated_cls.__dict__.items(): + if callable(attr_value): + setattr(decorated_cls, attr_name, method_logger(attr_value)) + return decorated_cls - def decorator(decorated_cls): - for attr_name, attr_value in decorated_cls.__dict__.items(): - if callable(attr_value): - setattr( - decorated_cls, attr_name, method_logger(attr_value, decorated_cls) - ) - - # Initialize or extend _api_methods attribute - decorated_cls._api_methods = getattr(decorated_cls, "_api_methods", []) - if api_methods: - decorated_cls._api_methods.extend(api_methods) - - return decorated_cls - - return decorator - - -def method_logger(method, cls): +def method_logger(method): """Decorator that adds logging functionality to a method.""" @functools.wraps(method) @@ -48,7 +37,7 @@ def wrapper(*args, **kwargs): api_method_name = str(method.__name__) # Track regular and "dunder" methods if api_method_name.startswith("__") or not api_method_name.startswith("_"): - add_api_method(api_method_name, cls) + add_api_method(api_method_name) try: result = method(*args, **kwargs) return result @@ -58,18 +47,19 @@ def wrapper(*args, **kwargs): return wrapper -def add_api_method(api_method_name, cls): +def add_api_method(api_method_name): global _lock + global _api_methods with _lock: # Push the method to the front of the _api_methods list - cls._api_methods.insert(0, api_method_name) + _api_methods.insert(0, api_method_name) # Keep the list length within the maximum limit (adjust MAX_LABELS_COUNT as needed) - cls._api_methods = cls._api_methods[:MAX_LABELS_COUNT] + _api_methods = _api_methods[:MAX_LABELS_COUNT] -def get_and_reset_api_methods(cls): +def get_and_reset_api_methods(): global _lock with _lock: - previous_api_methods = list(cls._api_methods) - cls._api_methods.clear() + previous_api_methods = list(_api_methods) + _api_methods.clear() return previous_api_methods diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2b17e67213..eb2264ba65 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -113,7 +113,6 @@ def _is_query(query_or_table: str) -> bool: return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None -@log_adapter.class_logger class Session( third_party_pandas_gbq.GBQIOMixin, third_party_pandas_parquet.ParquetIOMixin, @@ -1387,7 +1386,7 @@ def _start_query( """ Starts query job and waits for results. 
""" - api_methods = log_adapter.get_and_reset_api_methods(self) + api_methods = log_adapter.get_and_reset_api_methods() job_config = self._prepare_job_config(job_config) job_config.labels = bigframes_io.create_job_configs_labels( job_configs_labels=job_config.labels, api_methods=api_methods diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py index e8d1bedc9a..29f2519069 100644 --- a/tests/unit/core/test_log_adapter.py +++ b/tests/unit/core/test_log_adapter.py @@ -8,7 +8,7 @@ @pytest.fixture def test_instance(): # Create a simple class for testing - @log_adapter.class_logger() + @log_adapter.class_logger class TestClass: def method1(self): pass @@ -19,29 +19,26 @@ def method2(self): return TestClass() -def test_class_logger_decorator(test_instance): - # Ensure that the class logger decorator adds _api_methods attribute - assert hasattr(test_instance, "_api_methods") - assert test_instance._api_methods == [] - +def test_method_logging(test_instance): + test_instance.method1() + test_instance.method2() -def test_add_api_method(test_instance): - # Ensure that add_api_method correctly adds a method to _api_methods - log_adapter.add_api_method("method3", test_instance) - assert test_instance._api_methods == ["method3"] + # Check if the methods were added to the _api_methods list + api_methods = log_adapter.get_and_reset_api_methods() + assert api_methods == ["method2", "method1"] def test_add_api_method_limit(test_instance): # Ensure that add_api_method correctly adds a method to _api_methods for i in range(70): - log_adapter.add_api_method("method3", test_instance) - assert len(test_instance._api_methods) == MAX_LABELS_COUNT + test_instance.method2() + assert len(log_adapter._api_methods) == MAX_LABELS_COUNT def test_get_and_reset_api_methods(test_instance): # Ensure that get_and_reset_api_methods returns a copy and resets the list test_instance.method1() test_instance.method2() - previous_methods = log_adapter.get_and_reset_api_methods(test_instance) + previous_methods = log_adapter.get_and_reset_api_methods() assert previous_methods == ["method2", "method1"] - assert test_instance._api_methods == [] + assert log_adapter._api_methods == [] diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 62199147ef..5ceee44084 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -20,6 +20,8 @@ import pytest import bigframes +from bigframes.core import log_adapter +import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq @@ -56,6 +58,50 @@ def test_create_job_configs_labels_length_limit_not_met(): assert labels == expected_dict +def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): + cur_labels = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + } + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running two methods + df.head() + df.max() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) + expected_dict = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + "recent-bigframes-api-0": "__init__", + "recent-bigframes-api-1": "max", + "recent-bigframes-api-2": "__init__", + "recent-bigframes-api-3": "head", + "recent-bigframes-api-4": "__init__", + } + assert labels is not None + assert len(labels) == 7 + assert labels == expected_dict + + +def 
test_create_job_configs_labels_length_limit_met_and_labels_is_none(): + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running methods more than the labels' length limit + for i in range(66): + df.head() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=None, api_methods=api_methods + ) + assert labels is not None + assert len(labels) == 64 + assert "head" in labels.values() + assert "__init__" in labels.values() + + def test_create_job_configs_labels_length_limit_met(): cur_labels = { "bigframes-api": "read_pandas", @@ -66,14 +112,19 @@ def test_create_job_configs_labels_length_limit_met(): value = f"test{i}" cur_labels[key] = value # If cur_labels length is 62, we can only add one label from api_methods - api_methods = ["agg", "series-mode", "head"] + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running two methods + df.head() + df.max() + api_methods = log_adapter._api_methods labels = io_bq.create_job_configs_labels( job_configs_labels=cur_labels, api_methods=api_methods ) assert labels is not None assert len(labels) == 64 - assert "agg" in labels.values() + assert "max" in labels.values() + assert "__init__" in labels.values() assert "head" not in labels.values() assert "bigframes-api" in labels.keys() assert "source" in labels.keys() From 1c3deb5c6b473ca7e65ddc30e6bea62ca6e737fe Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 14 Nov 2023 18:09:12 +0000 Subject: [PATCH 27/32] Make the global _api_methods and lock threads --- bigframes/core/log_adapter.py | 42 ++++++++------------ bigframes/session/__init__.py | 3 +- tests/unit/core/test_log_adapter.py | 27 ++++++------- tests/unit/session/test_io_bigquery.py | 55 +++++++++++++++++++++++++- 4 files changed, 82 insertions(+), 45 deletions(-) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 9d488d6bc8..ee8ff8020b 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -14,33 +14,22 @@ import functools import threading +from typing import List _lock = threading.Lock() MAX_LABELS_COUNT = 64 +_api_methods: List = [] -def class_logger(api_methods=None): +def class_logger(decorated_cls): """Decorator that adds logging functionality to each method of the class.""" + for attr_name, attr_value in decorated_cls.__dict__.items(): + if callable(attr_value): + setattr(decorated_cls, attr_name, method_logger(attr_value)) + return decorated_cls - def decorator(decorated_cls): - for attr_name, attr_value in decorated_cls.__dict__.items(): - if callable(attr_value): - setattr( - decorated_cls, attr_name, method_logger(attr_value, decorated_cls) - ) - - # Initialize or extend _api_methods attribute - decorated_cls._api_methods = getattr(decorated_cls, "_api_methods", []) - if api_methods: - decorated_cls._api_methods.extend(api_methods) - - return decorated_cls - - return decorator - - -def method_logger(method, cls): +def method_logger(method): """Decorator that adds logging functionality to a method.""" @functools.wraps(method) @@ -48,7 +37,7 @@ def wrapper(*args, **kwargs): api_method_name = str(method.__name__) # Track regular and "dunder" methods if api_method_name.startswith("__") or not api_method_name.startswith("_"): - add_api_method(api_method_name, cls) + add_api_method(api_method_name) try: result = method(*args, **kwargs) return result @@ -58,18 +47,19 @@ def wrapper(*args, **kwargs): return wrapper -def add_api_method(api_method_name, cls): +def add_api_method(api_method_name): global _lock 
+ global _api_methods with _lock: # Push the method to the front of the _api_methods list - cls._api_methods.insert(0, api_method_name) + _api_methods.insert(0, api_method_name) # Keep the list length within the maximum limit (adjust MAX_LABELS_COUNT as needed) - cls._api_methods = cls._api_methods[:MAX_LABELS_COUNT] + _api_methods = _api_methods[:MAX_LABELS_COUNT] -def get_and_reset_api_methods(cls): +def get_and_reset_api_methods(): global _lock with _lock: - previous_api_methods = list(cls._api_methods) - cls._api_methods.clear() + previous_api_methods = list(_api_methods) + _api_methods.clear() return previous_api_methods diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2b17e67213..eb2264ba65 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -113,7 +113,6 @@ def _is_query(query_or_table: str) -> bool: return re.search(r"\s", query_or_table.strip(), re.MULTILINE) is not None -@log_adapter.class_logger class Session( third_party_pandas_gbq.GBQIOMixin, third_party_pandas_parquet.ParquetIOMixin, @@ -1387,7 +1386,7 @@ def _start_query( """ Starts query job and waits for results. """ - api_methods = log_adapter.get_and_reset_api_methods(self) + api_methods = log_adapter.get_and_reset_api_methods() job_config = self._prepare_job_config(job_config) job_config.labels = bigframes_io.create_job_configs_labels( job_configs_labels=job_config.labels, api_methods=api_methods diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py index e8d1bedc9a..48a28f4f3a 100644 --- a/tests/unit/core/test_log_adapter.py +++ b/tests/unit/core/test_log_adapter.py @@ -8,7 +8,7 @@ @pytest.fixture def test_instance(): # Create a simple class for testing - @log_adapter.class_logger() + @log_adapter.class_logger class TestClass: def method1(self): pass @@ -19,29 +19,26 @@ def method2(self): return TestClass() -def test_class_logger_decorator(test_instance): - # Ensure that the class logger decorator adds _api_methods attribute - assert hasattr(test_instance, "_api_methods") - assert test_instance._api_methods == [] - +def test_method_logging(test_instance): + test_instance.method1() + test_instance.method2() -def test_add_api_method(test_instance): - # Ensure that add_api_method correctly adds a method to _api_methods - log_adapter.add_api_method("method3", test_instance) - assert test_instance._api_methods == ["method3"] + # Check if the methods were added to the _api_methods list + api_methods = log_adapter.get_and_reset_api_methods() + assert api_methods == ["method2", "method1"] def test_add_api_method_limit(test_instance): # Ensure that add_api_method correctly adds a method to _api_methods for i in range(70): - log_adapter.add_api_method("method3", test_instance) - assert len(test_instance._api_methods) == MAX_LABELS_COUNT + test_instance.method2() + assert len(log_adapter._api_methods) == MAX_LABELS_COUNT def test_get_and_reset_api_methods(test_instance): # Ensure that get_and_reset_api_methods returns a copy and resets the list test_instance.method1() test_instance.method2() - previous_methods = log_adapter.get_and_reset_api_methods(test_instance) - assert previous_methods == ["method2", "method1"] - assert test_instance._api_methods == [] + previous_methods = log_adapter.get_and_reset_api_methods() + assert previous_methods is not None + assert log_adapter._api_methods == [] diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 62199147ef..5ceee44084 100644 --- 
a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -20,6 +20,8 @@ import pytest import bigframes +from bigframes.core import log_adapter +import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq @@ -56,6 +58,50 @@ def test_create_job_configs_labels_length_limit_not_met(): assert labels == expected_dict +def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): + cur_labels = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + } + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running two methods + df.head() + df.max() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=cur_labels, api_methods=api_methods + ) + expected_dict = { + "bigframes-api": "read_pandas", + "source": "bigquery-dataframes-temp", + "recent-bigframes-api-0": "__init__", + "recent-bigframes-api-1": "max", + "recent-bigframes-api-2": "__init__", + "recent-bigframes-api-3": "head", + "recent-bigframes-api-4": "__init__", + } + assert labels is not None + assert len(labels) == 7 + assert labels == expected_dict + + +def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running methods more than the labels' length limit + for i in range(66): + df.head() + api_methods = log_adapter._api_methods + + labels = io_bq.create_job_configs_labels( + job_configs_labels=None, api_methods=api_methods + ) + assert labels is not None + assert len(labels) == 64 + assert "head" in labels.values() + assert "__init__" in labels.values() + + def test_create_job_configs_labels_length_limit_met(): cur_labels = { "bigframes-api": "read_pandas", @@ -66,14 +112,19 @@ def test_create_job_configs_labels_length_limit_met(): value = f"test{i}" cur_labels[key] = value # If cur_labels length is 62, we can only add one label from api_methods - api_methods = ["agg", "series-mode", "head"] + df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + # Test running two methods + df.head() + df.max() + api_methods = log_adapter._api_methods labels = io_bq.create_job_configs_labels( job_configs_labels=cur_labels, api_methods=api_methods ) assert labels is not None assert len(labels) == 64 - assert "agg" in labels.values() + assert "max" in labels.values() + assert "__init__" in labels.values() assert "head" not in labels.values() assert "bigframes-api" in labels.keys() assert "source" in labels.keys() From 115de2742a70c026983b233a999187e6a4d29d2a Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 14 Nov 2023 19:58:37 +0000 Subject: [PATCH 28/32] address comments --- bigframes/core/log_adapter.py | 5 ----- bigframes/session/_io/bigquery.py | 1 - tests/unit/core/test_log_adapter.py | 14 ++++++++++++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index ee8ff8020b..b2bf2c999c 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -38,11 +38,6 @@ def wrapper(*args, **kwargs): # Track regular and "dunder" methods if api_method_name.startswith("__") or not api_method_name.startswith("_"): add_api_method(api_method_name) - try: - result = method(*args, **kwargs) - return result - except Exception as e: - raise e return wrapper diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index b2a6ea3472..dae73301e7 100644 --- a/bigframes/session/_io/bigquery.py +++ 
b/bigframes/session/_io/bigquery.py @@ -34,7 +34,6 @@ def create_job_configs_labels( job_configs_labels: Optional[Dict[str, str]], api_methods: Sequence[str], ) -> Dict[str, str]: - # If there is no label set if job_configs_labels is None: job_configs_labels = {} diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py index 48a28f4f3a..3bd4169977 100644 --- a/tests/unit/core/test_log_adapter.py +++ b/tests/unit/core/test_log_adapter.py @@ -1,3 +1,17 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest from bigframes.core import log_adapter From b0adf27126cfaf9a6ec524072a150b30fc7998b9 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 14 Nov 2023 19:58:37 +0000 Subject: [PATCH 29/32] address comments --- bigframes/core/log_adapter.py | 5 ----- bigframes/session/_io/bigquery.py | 1 - tests/unit/core/test_log_adapter.py | 18 +++++++++++++++++- tests/unit/session/test_io_bigquery.py | 13 +++++-------- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index ee8ff8020b..b2bf2c999c 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -38,11 +38,6 @@ def wrapper(*args, **kwargs): # Track regular and "dunder" methods if api_method_name.startswith("__") or not api_method_name.startswith("_"): add_api_method(api_method_name) - try: - result = method(*args, **kwargs) - return result - except Exception as e: - raise e return wrapper diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index b2a6ea3472..dae73301e7 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -34,7 +34,6 @@ def create_job_configs_labels( job_configs_labels: Optional[Dict[str, str]], api_methods: Sequence[str], ) -> Dict[str, str]: - # If there is no label set if job_configs_labels is None: job_configs_labels = {} diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py index 48a28f4f3a..376b7f2075 100644 --- a/tests/unit/core/test_log_adapter.py +++ b/tests/unit/core/test_log_adapter.py @@ -1,3 +1,17 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import pytest from bigframes.core import log_adapter @@ -25,7 +39,9 @@ def test_method_logging(test_instance): # Check if the methods were added to the _api_methods list api_methods = log_adapter.get_and_reset_api_methods() - assert api_methods == ["method2", "method1"] + assert api_methods is not None + assert "method1" in api_methods + assert "method2" in api_methods def test_add_api_method_limit(test_instance): diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 5ceee44084..1842e718c2 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -75,14 +75,12 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): expected_dict = { "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", - "recent-bigframes-api-0": "__init__", - "recent-bigframes-api-1": "max", + "recent-bigframes-api-0": "max", + "recent-bigframes-api-1": "head", "recent-bigframes-api-2": "__init__", - "recent-bigframes-api-3": "head", - "recent-bigframes-api-4": "__init__", } assert labels is not None - assert len(labels) == 7 + assert len(labels) == 5 assert labels == expected_dict @@ -99,7 +97,6 @@ def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): assert labels is not None assert len(labels) == 64 assert "head" in labels.values() - assert "__init__" in labels.values() def test_create_job_configs_labels_length_limit_met(): @@ -114,6 +111,7 @@ def test_create_job_configs_labels_length_limit_met(): # If cur_labels length is 62, we can only add one label from api_methods df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) # Test running two methods + df.agg() df.head() df.max() api_methods = log_adapter._api_methods @@ -124,8 +122,7 @@ def test_create_job_configs_labels_length_limit_met(): assert labels is not None assert len(labels) == 64 assert "max" in labels.values() - assert "__init__" in labels.values() - assert "head" not in labels.values() + assert "agg" not in labels.values() assert "bigframes-api" in labels.keys() assert "source" in labels.keys() From df9c9c03686da8b2abba88a2210b5f144a900357 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 14 Nov 2023 20:30:31 +0000 Subject: [PATCH 30/32] fix error --- bigframes/core/log_adapter.py | 1 + tests/unit/session/test_io_bigquery.py | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index b2bf2c999c..b790d19562 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -38,6 +38,7 @@ def wrapper(*args, **kwargs): # Track regular and "dunder" methods if api_method_name.startswith("__") or not api_method_name.startswith("_"): add_api_method(api_method_name) + return method(*args, **kwargs) return wrapper diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 1842e718c2..e1481d3f05 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -75,12 +75,14 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): expected_dict = { "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", - "recent-bigframes-api-0": "max", - "recent-bigframes-api-1": "head", + "recent-bigframes-api-0": "__init__", + "recent-bigframes-api-1": "max", "recent-bigframes-api-2": "__init__", + "recent-bigframes-api-3": "head", + "recent-bigframes-api-4": "__init__", } assert labels is not None - assert 
len(labels) == 5 + assert len(labels) == 7 assert labels == expected_dict @@ -111,7 +113,6 @@ def test_create_job_configs_labels_length_limit_met(): # If cur_labels length is 62, we can only add one label from api_methods df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) # Test running two methods - df.agg() df.head() df.max() api_methods = log_adapter._api_methods @@ -122,7 +123,7 @@ def test_create_job_configs_labels_length_limit_met(): assert labels is not None assert len(labels) == 64 assert "max" in labels.values() - assert "agg" not in labels.values() + assert "head" not in labels.values() assert "bigframes-api" in labels.keys() assert "source" in labels.keys() From 00bb6de5dc35aaa122a3c832fdcbcea3e5d00286 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 14 Nov 2023 21:31:56 +0000 Subject: [PATCH 31/32] fix None job_config error --- bigframes/session/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index adc9b4375f..069bd5d260 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1386,6 +1386,8 @@ def _prepare_job_config( ) -> bigquery.QueryJobConfig: if job_config is None: job_config = self.bqclient.default_query_job_config + if job_config is None: + job_config = bigquery.QueryJobConfig() if bigframes.options.compute.maximum_bytes_billed is not None: job_config.maximum_bytes_billed = ( bigframes.options.compute.maximum_bytes_billed From 36fea0681fce9fb9b7dd99369b889438975a9367 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 14 Nov 2023 22:39:07 +0000 Subject: [PATCH 32/32] address comments --- tests/unit/test_compute_options.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_compute_options.py b/tests/unit/test_compute_options.py index 3b3bbca8c6..a613bca7b9 100644 --- a/tests/unit/test_compute_options.py +++ b/tests/unit/test_compute_options.py @@ -18,14 +18,9 @@ def test_maximum_bytes_option(): session = resources.create_bigquery_session() - num_query_calls = 0 with bf.option_context("compute.maximum_bytes_billed", 10000): - # clear initial method calls - session.bqclient.method_calls = [] + session.bqclient.query.reset_mock() session._start_query("query") - for call in session.bqclient.method_calls: - _, _, kwargs = call - num_query_calls += 1 - if "job_config" in kwargs: - assert kwargs["job_config"].maximum_bytes_billed == 10000 - assert num_query_calls > 0 + call = session.bqclient.query.call_args + assert call.kwargs["job_config"].maximum_bytes_billed == 10000 + session.bqclient.query.assert_called_once()
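
Taken together, patches 27 through 32 move the method tracking from per-class state to a single module-level list guarded by a lock, and feed the recorded names into the BigQuery job labels. The short, self-contained Python sketch below summarizes that end state for reference. It is a reconstruction from the hunks and unit tests above, not a verbatim copy of bigframes/core/log_adapter.py or bigframes/session/_io/bigquery.py; in particular, the truncation and ordering behavior of create_job_configs_labels is inferred from the test expectations and should be treated as an assumption.

import functools
import threading
from typing import Dict, List, Optional, Sequence

_lock = threading.Lock()
MAX_LABELS_COUNT = 64          # BigQuery allows at most 64 labels per job
_api_methods: List[str] = []   # shared across all decorated classes


def class_logger(decorated_cls):
    """Wrap every callable attribute of the class with method_logger."""
    # Copy the items so we are not mutating the class __dict__ while iterating.
    for attr_name, attr_value in list(decorated_cls.__dict__.items()):
        if callable(attr_value):
            setattr(decorated_cls, attr_name, method_logger(attr_value))
    return decorated_cls


def method_logger(method):
    """Record the method name (public or dunder), then run the method."""

    @functools.wraps(method)
    def wrapper(*args, **kwargs):
        name = method.__name__
        if name.startswith("__") or not name.startswith("_"):
            add_api_method(name)
        return method(*args, **kwargs)

    return wrapper


def add_api_method(api_method_name: str) -> None:
    global _api_methods
    with _lock:
        # Most recent call goes to the front; cap the list at the label limit.
        _api_methods.insert(0, api_method_name)
        _api_methods = _api_methods[:MAX_LABELS_COUNT]


def get_and_reset_api_methods() -> List[str]:
    with _lock:
        previous = list(_api_methods)
        _api_methods.clear()
    return previous


def create_job_configs_labels(
    job_configs_labels: Optional[Dict[str, str]],
    api_methods: Sequence[str],
) -> Dict[str, str]:
    # Assumed behavior, reconstructed from the tests: keep any existing labels
    # and append "recent-bigframes-api-N" entries (most recent call first)
    # until the 64-label BigQuery limit is reached.
    labels = dict(job_configs_labels or {})
    for i, name in enumerate(api_methods):
        if len(labels) >= MAX_LABELS_COUNT:
            break
        labels[f"recent-bigframes-api-{i}"] = name
    return labels

Keeping _api_methods at module scope is what lets Session._start_query call get_and_reset_api_methods() without holding a reference to whichever DataFrame, Series, or groupby object triggered the query; the lock makes the insert-then-truncate and copy-then-clear sequences atomic when multiple sessions issue queries concurrently, which is why the earlier per-class _api_methods attribute was dropped.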