From b09c746b207eb0af264ebfff5a13e41ecd0a85db Mon Sep 17 00:00:00 2001 From: Bradford Orr Date: Mon, 7 Aug 2023 16:43:21 +0000 Subject: [PATCH 01/10] feat: add `bigframes.options.compute.maximum_bytes_billed` option that sets maximum bytes billed on query jobs -implement context manager for global options -maximum_bytes_billed only applies to query jobs. This limitation will be set per query. Operations that trigger multiple jobs may result in total usage beyond this setting --- bigframes/__init__.py | 3 +- bigframes/_config/__init__.py | 11 +++++ bigframes/_config/compute_options.py | 35 +++++++++++++++ bigframes/_config/display_options.py | 23 ++++------ bigframes/pandas/__init__.py | 7 +++ bigframes/session/__init__.py | 19 +++++--- tests/system/conftest.py | 7 --- tests/system/small/test_progress_bar.py | 17 ++++--- tests/unit/test_compute_options.py | 15 +++++++ .../pandas/_config/config.py | 44 +++++++++++++++++++ 10 files changed, 145 insertions(+), 36 deletions(-) create mode 100644 bigframes/_config/compute_options.py create mode 100644 tests/unit/test_compute_options.py create mode 100644 third_party/bigframes_vendored/pandas/_config/config.py diff --git a/bigframes/__init__.py b/bigframes/__init__.py index 8f41790072..bd1476957b 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -14,7 +14,7 @@ """BigQuery DataFrames provides a DataFrame API scaled by the BigQuery engine.""" -from bigframes._config import options +from bigframes._config import option_context, options from bigframes._config.bigquery_options import BigQueryOptions from bigframes.core.global_session import close_session, get_global_session from bigframes.session import connect, Session @@ -28,4 +28,5 @@ "connect", "Session", "__version__", + "option_context", ] diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index e26eaf8800..8dcebfce6a 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -18,8 +18,10 @@ """ import 
bigframes._config.bigquery_options as bigquery_options +import bigframes._config.compute_options as compute_options import bigframes._config.display_options as display_options import bigframes._config.sampling_options as sampling_options +import third_party.bigframes_vendored.pandas._config.config as pandas_config class Options: @@ -29,6 +31,7 @@ def __init__(self): self._bigquery_options = bigquery_options.BigQueryOptions() self._display_options = display_options.DisplayOptions() self._sampling_options = sampling_options.SamplingOptions() + self._compute_options = compute_options.ComputeOptions() @property def bigquery(self) -> bigquery_options.BigQueryOptions: @@ -49,6 +52,11 @@ def sampling(self) -> sampling_options.SamplingOptions: parameters in specific functions.""" return self._sampling_options + @property + def compute(self) -> compute_options.ComputeOptions: + """Options controlling object computation.""" + return self._compute_options + options = Options() """Global options for default session.""" @@ -58,3 +66,6 @@ def sampling(self) -> sampling_options.SamplingOptions: "Options", "options", ) + + +option_context = pandas_config.option_context diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py new file mode 100644 index 0000000000..20c31d3906 --- /dev/null +++ b/bigframes/_config/compute_options.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Options controlling object computation.""" + +import dataclasses +from typing import Optional + + +@dataclasses.dataclass +class ComputeOptions: + """ + Encapsulates configuration for compute options. + + Attributes: + maximum_bytes_billed (int, Optional): + Limits the bytes billed for query jobs. Queries that will have + bytes billed beyond this limit will fail (without incurring a + charge). If unspecified, this will be set to your project default. + See ``QueryJobConfig.maximum_bytes_billed`` in the google-cloud-bigquery documentation. + + """ + + maximum_bytes_billed: Optional[int] = None diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index 8bd2743f17..ad3ea3f68c 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -40,17 +40,12 @@ def pandas_repr(display_options: DisplayOptions): This context manager makes sure we reset the pandas options when we're done so that we don't override pandas behavior. """ - original_max_cols = pd.options.display.max_columns - original_max_rows = pd.options.display.max_rows - original_show_dimensions = pd.options.display.show_dimensions - - pd.options.display.max_columns = display_options.max_columns - pd.options.display.max_rows = display_options.max_rows - pd.options.display.show_dimensions = True # type: ignore - - try: - yield - finally: - pd.options.display.max_columns = original_max_cols - pd.options.display.max_rows = original_max_rows - pd.options.display.show_dimensions = original_show_dimensions + with pd.option_context( + "display.max_columns", + display_options.max_columns, + "display.max_rows", + display_options.max_rows, + "display.show_dimensions", + True, + ) as pandas_context: + yield (pandas_context) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 5c1928e6f0..e4a9da54c0 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -162,6 +162,13 @@ def merge( ) +options = config.options +"""Global :class:`~bigframes._config.Options` to
configure BigQuery DataFrames.""" + +option_context = config.option_context +"""Global :class:`~bigframes._config.option_context` to configure BigQuery DataFrames.""" + + merge.__doc__ = vendored_pandas_merge.merge.__doc__ diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5ec3da1a5a..6cdb741c23 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1494,12 +1494,10 @@ def _start_query( max_results: Optional[int] = None, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """ - Starts query job and waits for results + Starts query job and waits for results. """ - if job_config is not None: - query_job = self.bqclient.query(sql, job_config=job_config) - else: - query_job = self.bqclient.query(sql) + job_config = self._prepare_job_config(job_config) + query_job = self.bqclient.query(sql, job_config=job_config) opts = bigframes.options.display if opts.progress_bar is not None and not query_job.configuration.dry_run: @@ -1532,6 +1530,17 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob): else: job.result() + def _prepare_job_config( + self, job_config: Optional[bigquery.QueryJobConfig] = None + ) -> bigquery.QueryJobConfig: + if job_config is None: + job_config = self.bqclient.default_query_job_config + if bigframes.options.compute.maximum_bytes_billed is not None: + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed + ) + return job_config + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index cb664302a8..466d564d29 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -894,13 +894,6 @@ def usa_names_grouped_table( return session.bqclient.get_table(table_id) -@pytest.fixture() -def deferred_repr(): - bigframes.options.display.repr_mode = "deferred" - yield - bigframes.options.display.repr_mode = "head" - - 
@pytest.fixture() def restore_sampling_settings(): enable_downsampling = bigframes.options.sampling.enable_downsampling diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 00380c2639..74ae489e84 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -144,12 +144,11 @@ def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): assert string in query_job_repr -def test_query_job_dry_run( - penguins_df_default_index: bf.dataframe.DataFrame, capsys, deferred_repr -): - repr(penguins_df_default_index) - repr(penguins_df_default_index["body_mass_g"]) - lines = capsys.readouterr().out.split("\n") - lines = filter(None, lines) - for line in lines: - assert "Computation deferred. Computation will process" in line +def test_query_job_dry_run(penguins_df_default_index: bf.dataframe.DataFrame, capsys): + with bf.option_context("display.repr_mode", "deferred"): + repr(penguins_df_default_index) + repr(penguins_df_default_index["body_mass_g"]) + lines = capsys.readouterr().out.split("\n") + lines = filter(None, lines) + for line in lines: + assert "Computation deferred. 
Computation will process" in line diff --git a/tests/unit/test_compute_options.py b/tests/unit/test_compute_options.py new file mode 100644 index 0000000000..a2131b8297 --- /dev/null +++ b/tests/unit/test_compute_options.py @@ -0,0 +1,15 @@ +import bigframes as bf + + +def test_maximum_bytes_option(session: bf.Session, mock_bigquery_client): + num_query_calls = 0 + with bf.option_context("compute.maximum_bytes_billed", 10000): + # clear initial method calls + mock_bigquery_client.method_calls = [] + session._start_query("query") + for call in mock_bigquery_client.method_calls: + name, _, kwargs = call + if name == "query": + num_query_calls = +1 + assert kwargs["job_config"].maximum_bytes_billed == 10000 + assert num_query_calls > 0 diff --git a/third_party/bigframes_vendored/pandas/_config/config.py b/third_party/bigframes_vendored/pandas/_config/config.py new file mode 100644 index 0000000000..a9d5ef86c2 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/_config/config.py @@ -0,0 +1,44 @@ +import contextlib +import operator + +import bigframes + + +class option_context(contextlib.ContextDecorator): + """ + Context manager to temporarily set options in the `with` statement context. + + You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. + + Examples + -------- + >>> import bigframes + >>> with bigframes.option_context('display.max_rows', 10, 'display.max_columns', 5): + ... pass + """ + + def __init__(self, *args) -> None: + if len(args) % 2 != 0 or len(args) < 2: + raise ValueError( + "Need to invoke as option_context(pat, val, [(pat, val), ...])." 
+ ) + + self.ops = list(zip(args[::2], args[1::2])) + + def __enter__(self) -> None: + self.undo = [ + (pat, operator.attrgetter(pat)(bigframes.options)) for pat, val in self.ops + ] + + for pat, val in self.ops: + self._set_option(pat, val) + + def __exit__(self, *args) -> None: + if self.undo: + for pat, val in self.undo: + self._set_option(pat, val) + + def _set_option(self, pat, val): + root, attr = pat.rsplit(".", 1) + parent = operator.attrgetter(root)(bigframes.options) + setattr(parent, attr, val) From 4ba47f5a2878b29c02fba82019adb88db56daa57 Mon Sep 17 00:00:00 2001 From: Bradford Orr Date: Mon, 7 Aug 2023 16:43:21 +0000 Subject: [PATCH 02/10] feat: add `bigframes.options.compute.maximum_bytes_billed` option that sets maximum bytes billed on query jobs -implement context manager for global options -maximum_bytes_billed only applies to query jobs. This limitation will be set per query. Operations that trigger multiple jobs may result in total usage beyond this setting --- tests/unit/test_compute_options.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_compute_options.py b/tests/unit/test_compute_options.py index a2131b8297..a3bc1c100e 100644 --- a/tests/unit/test_compute_options.py +++ b/tests/unit/test_compute_options.py @@ -1,13 +1,29 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import bigframes as bf +from . 
import resources -def test_maximum_bytes_option(session: bf.Session, mock_bigquery_client): + +def test_maximum_bytes_option(): + session = resources.create_bigquery_session() num_query_calls = 0 with bf.option_context("compute.maximum_bytes_billed", 10000): # clear initial method calls - mock_bigquery_client.method_calls = [] + session.bqclient.method_calls = [] session._start_query("query") - for call in mock_bigquery_client.method_calls: + for call in session.bqclient.method_calls: name, _, kwargs = call if name == "query": num_query_calls = +1 From 704a2d70231ec74d94a3d31ed57825aa6470026e Mon Sep 17 00:00:00 2001 From: Bradford Orr Date: Tue, 24 Oct 2023 16:56:05 +0000 Subject: [PATCH 03/10] add license header --- third_party/bigframes_vendored/pandas/_config/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/bigframes_vendored/pandas/_config/config.py b/third_party/bigframes_vendored/pandas/_config/config.py index a9d5ef86c2..8abaca76c7 100644 --- a/third_party/bigframes_vendored/pandas/_config/config.py +++ b/third_party/bigframes_vendored/pandas/_config/config.py @@ -1,3 +1,4 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/_config/config.py import contextlib import operator From f5dc41a7817f9fe25e8726ebf83f77de2e95a976 Mon Sep 17 00:00:00 2001 From: Bradford Orr Date: Fri, 27 Oct 2023 01:26:07 +0000 Subject: [PATCH 04/10] update test to pass coverage check --- tests/unit/test_compute_options.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_compute_options.py b/tests/unit/test_compute_options.py index a3bc1c100e..499a0a5fef 100644 --- a/tests/unit/test_compute_options.py +++ b/tests/unit/test_compute_options.py @@ -24,8 +24,7 @@ def test_maximum_bytes_option(): session.bqclient.method_calls = [] session._start_query("query") for call in session.bqclient.method_calls: - name, _, kwargs = call - if name == "query": - num_query_calls = +1 - assert
kwargs["job_config"].maximum_bytes_billed == 10000 + _, _, kwargs = call + num_query_calls += 1 + assert kwargs["job_config"].maximum_bytes_billed == 10000 assert num_query_calls > 0 From ef291ffc6a61b36f04a57d8aea0333115a871b69 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:07:58 -0700 Subject: [PATCH 05/10] docs: add arithmetic df sample code (#153) * docs: add arithmetic df sample code * fix: address comments --- bigframes/session/__init__.py | 4 +- .../bigframes_vendored/pandas/core/frame.py | 494 +++++++++++++++++- 2 files changed, 492 insertions(+), 6 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5862ba957f..4858c7726a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -352,7 +352,7 @@ def read_gbq_query( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - Simple query input: + Simple query input: >>> df = bpd.read_gbq_query(''' ... SELECT @@ -368,7 +368,7 @@ def read_gbq_query( [2 rows x 3 columns] - Preserve ordering in a query input. + Preserve ordering in a query input. >>> df = bpd.read_gbq_query(''' ... SELECT diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 67836a8fd2..013d170114 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -697,6 +697,7 @@ def align( Join method is specified for each axis Index. + Args: other (DataFrame or Series): join ({{'outer', 'inner', 'left', 'right'}}, default 'outer'): @@ -978,9 +979,9 @@ def sort_values( Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. - kind (str, default `quicksort`): - Choice of sorting algorithm. Accepts 'quicksort’, ‘mergesort’, - ‘heapsort’, ‘stable’.
Ignored except when determining whether to + kind (str, default 'quicksort'): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first', 'last'}, default `last`): ``{'first', 'last'}``, default 'last' Puts NaNs at the beginning @@ -1014,6 +1015,29 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].eq(360) + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + + You can also use comparison operator ``==``: + >>> df["degrees"] == 360 + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1036,6 +1060,30 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ...
index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].ne(360) + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + + You can also use comparison operator ``!=``: + + >>> df["degrees"] != 360 + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1061,6 +1109,30 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].le(180) + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + + You can also use comparison operator ``<=``: + + >>> df["degrees"] <= 180 + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1087,6 +1159,30 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ...
index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].lt(180) + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + + You can also use comparison operator ``<``: + + >>> df["degrees"] < 180 + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1113,6 +1209,30 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].ge(360) + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + + You can also use comparison operator ``>=``: + + >>> df["degrees"] >= 360 + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1139,6 +1259,28 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ...
index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].gt(360) + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + + You can also use comparison operator ``>``: + + >>> df["degrees"] > 360 + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1162,6 +1304,32 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].add(df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + + You can also use arithmetic operator ``+``: + + >>> df['A'] + (df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1185,6 +1353,32 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].sub(df['B']) + 0 -3 + 1 -3 + 2 -3 + dtype: Int64 + + You can also use arithmetic operator ``-``: + + >>> df['A'] - (df['B']) + 0 -3 + 1 -3 + 2 -3 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1208,6 +1402,29 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ...
'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rsub(df['B']) + 0 3 + 1 3 + 2 3 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``-``: + + >>> df['B'] - (df['A']) + 0 3 + 1 3 + 2 3 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1231,6 +1448,32 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].mul(df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + + You can also use arithmetic operator ``*``: + + >>> df['A'] * (df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1254,6 +1497,32 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].truediv(df['B']) + 0 0.25 + 1 0.4 + 2 0.5 + dtype: Float64 + + You can also use arithmetic operator ``/``: + + >>> df['A'] / (df['B']) + 0 0.25 + 1 0.4 + 2 0.5 + dtype: Float64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1277,6 +1546,29 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... 
}) + >>> df['A'].rtruediv(df['B']) + 0 4.0 + 1 2.5 + 2 2.0 + dtype: Float64 + + It's equivalent to using arithmetic operator: ``/``: + + >>> df['B'] / (df['A']) + 0 4.0 + 1 2.5 + 2 2.0 + dtype: Float64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1300,6 +1592,32 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].floordiv(df['B']) + 0 0 + 1 0 + 2 0 + dtype: Int64 + + You can also use arithmetic operator ``//``: + + >>> df['A'] // (df['B']) + 0 0 + 1 0 + 2 0 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1323,6 +1641,29 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rfloordiv(df['B']) + 0 4 + 1 2 + 2 2 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``//``: + + >>> df['B'] // (df['A']) + 0 4 + 1 2 + 2 2 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1346,6 +1687,32 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... 
}) + + You can use method name: + + >>> df['A'].mod(df['B']) + 0 1 + 1 2 + 2 3 + dtype: Int64 + + You can also use arithmetic operator ``%``: + + >>> df['A'] % (df['B']) + 0 1 + 1 2 + 2 3 + dtype: Int64 + Args: other: Any single or multiple element data structure, or list-like object. @@ -1369,6 +1736,29 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rmod(df['B']) + 0 0 + 1 1 + 2 0 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``%``: + + >>> df['B'] % (df['A']) + 0 0 + 1 1 + 2 0 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1382,7 +1772,7 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def pow(self, other, axis: str | int = "columns") -> DataFrame: - """Get Exponential power of dataframe and other, element-wise (binary operator `pow`). + """Get Exponential power of dataframe and other, element-wise (binary operator `**`). Equivalent to ``dataframe ** other``, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, `rpow`. @@ -1393,6 +1783,32 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... 
}) + + You can use method name: + + >>> df['A'].pow(df['B']) + 0 1 + 1 32 + 2 729 + dtype: Int64 + + You can also use arithmetic operator ``**``: + + >>> df['A'] ** (df['B']) + 0 1 + 1 32 + 2 729 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1417,6 +1833,29 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rpow(df['B']) + 0 4 + 1 25 + 2 216 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``**``: + + >>> df['B'] ** (df['A']) + 0 4 + 1 25 + 2 216 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1438,6 +1877,21 @@ def combine( to element-wise combine columns. The row and column indexes of the resulting DataFrame will be the union of the two. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 + >>> df1.combine(df2, take_smaller) + A B + 0 0 3 + 1 0 3 + + [2 rows x 2 columns] + Args: other (DataFrame): The DataFrame to merge column-wise. @@ -1468,6 +1922,20 @@ def combine_first(self, other) -> DataFrame: second.loc[index, col] are not missing values, upon calling first.combine_first(second). 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) + >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine_first(df2) + A B + 0 1.0 3.0 + 1 0.0 4.0 + + [2 rows x 2 columns] + Args: other (DataFrame): Provided DataFrame to use to fill null values. @@ -1485,6 +1953,24 @@ def update( Aligns on indices. There is no return value. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = bpd.DataFrame({'B': [4, 5, 6], + ... 'C': [7, 8, 9]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + [3 rows x 2 columns] + Args: other (DataFrame, or object coercible into a DataFrame): Should have at least one matching index/column label From 73c77ba01974b4f036cd999b6d38d24343f6411a Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 30 Oct 2023 18:56:14 +0000 Subject: [PATCH 06/10] feat: Implement operator `@` for `DataFrame.dot` (#139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/297502513 🦕 --- bigframes/dataframe.py | 2 ++ tests/system/small/test_dataframe.py | 33 +++++++++++++++++++++++++++ tests/system/small/test_multiindex.py | 16 +++++++++++++ 3 files changed, 51 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 49d7ad991a..3369fb4868 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2707,3 +2707,5 @@ def get_right_id(id): result = result[other.name].rename() return result + + __matmul__ = dot diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b503f9a31d..c96faa3526 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3264,6 +3264,23 @@ def test_df_dot( ) +def test_df_dot_operator( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. 
+ for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_df_dot_series( matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df ): @@ -3278,3 +3295,19 @@ def test_df_dot_series( bf_result, pd_result, ) + + +def test_df_dot_operator_series( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df["x"]).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df["x"] + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + pd_result = pd_result.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index d6bf46f77c..bc35f633fd 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -998,6 +998,9 @@ def test_df_multi_index_dot_not_supported(): with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + # right multi-index right_index = pandas.MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "bb")]) bf1 = bpd.DataFrame(left_matrix) @@ -1005,6 +1008,9 @@ def test_df_multi_index_dot_not_supported(): with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + def test_column_multi_index_dot_not_supported(): left_matrix = [[1, 2, 3], [2, 5, 7]] @@ -1022,6 +1028,11 @@ def test_column_multi_index_dot_not_supported(): ): bf1.dot(bf2) + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 + # right 
multi-columns bf1 = bpd.DataFrame(left_matrix) bf2 = bpd.DataFrame(right_matrix, columns=multi_level_columns) @@ -1029,3 +1040,8 @@ def test_column_multi_index_dot_not_supported(): NotImplementedError, match="Multi-level column input is not supported" ): bf1.dot(bf2) + + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 From a2f730de3e6fef21bc3417bbc7b76b6f42518743 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 31 Oct 2023 10:54:17 -0700 Subject: [PATCH 07/10] test: add code snippets for loading data from BigQuery Job (#154) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: add code snippets for loading data from BigQuery Job * fix: address the comments * fix: fix the broken test * use BigQuery Client library to get the job_id * feat: Implement operator `@` for `DataFrame.dot` (#139) Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/297502513 🦕 * fix: fix the comments --------- Co-authored-by: Shobhit Singh --- .../load_data_from_biquery_job_test.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 samples/snippets/load_data_from_biquery_job_test.py diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py new file mode 100644 index 0000000000..5271574a49 --- /dev/null +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -0,0 +1,51 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_load_data_from_bigquery_job(): + from google.cloud import bigquery + + # Construct a BigQuery client object. 
+ client = bigquery.Client(project="bigframes-dev", location="us") + + query = """ + SELECT * + FROM `bigquery-public-data.ml_datasets.penguins` + LIMIT 20 + """ + query_job = client.query(query) + JOB_ID = query_job.job_id + your_project_id = "bigframes-dev" + + # [START bigquery_dataframes_load_data_from_bigquery_job] + from google.cloud import bigquery + + import bigframes.pandas as bpd + + # Project ID inserted based on the query results selected to explore + project = your_project_id + # Location inserted based on the query results selected to explore + location = "us" + client = bigquery.Client(project=project, location=location) + + # Job ID inserted based on the query results selected to explore + job_id = JOB_ID + job = client.get_job(job_id) + destination = str(job.destination) + + # Load data from a BigQuery table using BigFrames DataFrames: + bq_df = bpd.read_gbq_table(destination) + + # [END bigquery_dataframes_load_data_from_bigquery_job] + assert bq_df is not None From b797a99d36413dbc94557123d7d50dcdbfaf3bc8 Mon Sep 17 00:00:00 2001 From: Bradford Orr Date: Tue, 31 Oct 2023 18:23:05 +0000 Subject: [PATCH 08/10] add option_context to __all__ --- bigframes/pandas/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 8ec3b913ea..0fab1109dc 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -175,13 +175,6 @@ def merge( ) -options = config.options -"""Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" - -option_context = config.option_context -"""Global :class:`~bigframes._config.option_context` to configure BigQuery DataFrames.""" - - merge.__doc__ = vendored_pandas_merge.merge.__doc__ @@ -469,6 +462,9 @@ def read_gbq_function(function_name: str): options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" +option_context = config.option_context
+"""Global :class:`~bigframes._config.option_context` to configure BigQuery DataFrames.""" + # Session management APIs get_global_session = global_session.get_global_session close_session = global_session.close_session @@ -501,6 +497,7 @@ def read_gbq_function(function_name: str): # Other public pandas attributes "NamedAgg", "options", + "option_context", # Session management APIs "get_global_session", "close_session", From ad2fd81eedbfe32329815ff6d67e5d3276b89984 Mon Sep 17 00:00:00 2001 From: Bradford Orr Date: Tue, 31 Oct 2023 19:01:07 +0000 Subject: [PATCH 09/10] add autoclass for compute options --- docs/reference/bigframes/options.rst | 2 ++ docs/templates/toc.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/reference/bigframes/options.rst b/docs/reference/bigframes/options.rst index d831a519fe..54ddd913bc 100644 --- a/docs/reference/bigframes/options.rst +++ b/docs/reference/bigframes/options.rst @@ -12,3 +12,5 @@ Options and settings .. autoclass:: bigframes._config.display_options.DisplayOptions .. autoclass:: bigframes._config.sampling_options.SamplingOptions + +.. 
autoclass:: bigframes._config.sampling_options.ComputeOptions diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 4fe2ec1a6a..9879721d28 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -13,6 +13,8 @@ uid: bigframes._config.display_options.DisplayOptions - name: SamplingOptions uid: bigframes._config.sampling_options.SamplingOptions + - name: ComputeOptions + uid: bigframes._config.compute_options.ComputeOptions name: Options and settings - items: - name: Session From d0a85b501e33b2f3ce4c4d84ef397e05d0080031 Mon Sep 17 00:00:00 2001 From: Bradford Orr Date: Tue, 31 Oct 2023 19:04:48 +0000 Subject: [PATCH 10/10] update bad autoclass config --- docs/reference/bigframes/options.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/bigframes/options.rst b/docs/reference/bigframes/options.rst index 54ddd913bc..991399eb88 100644 --- a/docs/reference/bigframes/options.rst +++ b/docs/reference/bigframes/options.rst @@ -13,4 +13,4 @@ Options and settings .. autoclass:: bigframes._config.sampling_options.SamplingOptions -.. autoclass:: bigframes._config.sampling_options.ComputeOptions +.. autoclass:: bigframes._config.compute_options.ComputeOptions