diff --git a/.kokoro/continuous/notebook.cfg b/.kokoro/continuous/notebook.cfg index c14297019a..cc73c3bea4 100644 --- a/.kokoro/continuous/notebook.cfg +++ b/.kokoro/continuous/notebook.cfg @@ -6,11 +6,6 @@ env_vars: { value: "notebook" } -env_vars: { - key: "BENCHMARK_AND_PUBLISH" - value: "true" -} - env_vars: { key: "GOOGLE_CLOUD_PROJECT" value: "bigframes-testing" diff --git a/.kokoro/load/notebook.cfg b/.kokoro/load/notebook.cfg new file mode 100644 index 0000000000..c14297019a --- /dev/null +++ b/.kokoro/load/notebook.cfg @@ -0,0 +1,17 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "notebook" +} + +env_vars: { + key: "BENCHMARK_AND_PUBLISH" + value: "true" +} + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "bigframes-testing" +} diff --git a/CHANGELOG.md b/CHANGELOG.md index c398f17d43..55e295f06a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,37 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.22.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.21.0...v1.22.0) (2024-10-09) + + +### Features + +* Support regional endpoints for more bigquery locations ([#1061](https://github.com/googleapis/python-bigquery-dataframes/issues/1061)) ([45b672a](https://github.com/googleapis/python-bigquery-dataframes/commit/45b672a9a6359ec8c4755d94e63e5ae77a39754b)) +* Update LLM generators to warn user about model name instead of raising error. ([#1048](https://github.com/googleapis/python-bigquery-dataframes/issues/1048)) ([650d80d](https://github.com/googleapis/python-bigquery-dataframes/commit/650d80d1ad90927068cdb71efbfc548b416641a6)) + + +### Bug Fixes + +* Access MATERIALIZED_VIEW with read_gbq ([#1070](https://github.com/googleapis/python-bigquery-dataframes/issues/1070)) ([601e984](https://github.com/googleapis/python-bigquery-dataframes/commit/601e984aeb3ebf1dcf9cb3f1c34b7f0e4ec7cd16)) +* Correct zero row count in DataFrame from table view ([#1062](https://github.com/googleapis/python-bigquery-dataframes/issues/1062)) ([b536070](https://github.com/googleapis/python-bigquery-dataframes/commit/b53607015abb79be0aa5666681f1c53b5b1bc2b5)) +* Fix generic error message when entering an incorrect column name ([#1031](https://github.com/googleapis/python-bigquery-dataframes/issues/1031)) ([5ac217d](https://github.com/googleapis/python-bigquery-dataframes/commit/5ac217d650bc4f5576ba2b6595a3c0b1d88813ad)) +* Make `explode` respect the index labels ([#1064](https://github.com/googleapis/python-bigquery-dataframes/issues/1064)) ([99ca0df](https://github.com/googleapis/python-bigquery-dataframes/commit/99ca0df90acbbd81197c9b6718b7de7e4dfb86cc)) +* Make invalid location warning case-insensitive ([#1044](https://github.com/googleapis/python-bigquery-dataframes/issues/1044)) ([b6cd55a](https://github.com/googleapis/python-bigquery-dataframes/commit/b6cd55afc49b522904a13a7fd34d40201d176588)) +* Remove palm2 test case from llm load test ([#1063](https://github.com/googleapis/python-bigquery-dataframes/issues/1063)) ([575a10a](https://github.com/googleapis/python-bigquery-dataframes/commit/575a10a7ba0fbac76867f02da1dd65355f00d7aa)) +* Show warning for unknown location set through .ctor ([#1052](https://github.com/googleapis/python-bigquery-dataframes/issues/1052)) ([02c2da7](https://github.com/googleapis/python-bigquery-dataframes/commit/02c2da733b834b99d8044f3c5cac3ac9a85802a6)) + + +### Performance Improvements + +* Reduce schema tracking overhead 
([#1056](https://github.com/googleapis/python-bigquery-dataframes/issues/1056)) ([1c3879d](https://github.com/googleapis/python-bigquery-dataframes/commit/1c3879df2d6925e17e2cdca827db8ec919471f72)) +* Repr generates fewer queries ([#1046](https://github.com/googleapis/python-bigquery-dataframes/issues/1046)) ([d204603](https://github.com/googleapis/python-bigquery-dataframes/commit/d204603fdc024823421397dbe514f1f7ced1bc2c)) +* Speedup internal tree comparisons ([#1060](https://github.com/googleapis/python-bigquery-dataframes/issues/1060)) ([4379438](https://github.com/googleapis/python-bigquery-dataframes/commit/4379438fc4f44ea847fd2c00a82af544265a30d2)) + + +### Documentation + +* Add docstring return type section to BigQueryOptions class ([#964](https://github.com/googleapis/python-bigquery-dataframes/issues/964)) ([307385f](https://github.com/googleapis/python-bigquery-dataframes/commit/307385f5295ae6918e7d42dcca2c0e0c32e82446)) + ## [1.21.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.20.0...v1.21.0) (2024-10-02) diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index ac58c19fa5..75f91b28d3 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -29,6 +29,7 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes._config.compute_options as compute_options import bigframes._config.display_options as display_options +import bigframes._config.experiment_options as experiment_options import bigframes._config.sampling_options as sampling_options @@ -46,6 +47,9 @@ class ThreadLocalConfig(threading.local): compute_options: compute_options.ComputeOptions = field( default_factory=compute_options.ComputeOptions ) + experiment_options: experiment_options.ExperimentOptions = field( + default_factory=experiment_options.ExperimentOptions + ) class Options: @@ -122,6 +126,16 @@ def compute(self) -> compute_options.ComputeOptions: """ return self._local.compute_options + @property + def experiments(self) -> experiment_options.ExperimentOptions: + """Options controlling experiments + + Returns: + bigframes._config.experiment_options.ExperimentOptions: + Thread-local options for controlling experiments + """ + return self._local.experiment_options + @property def is_bigquery_thread_local(self) -> bool: """Indicator that we're using a thread-local session. diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 502f103bb5..2fdd7d6feb 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -36,26 +36,36 @@ UNKNOWN_LOCATION_MESSAGE = "The location '{location}' is set to an unknown value. Did you mean '{possibility}'?" 
-def _validate_location(value: Optional[str]): - - if value is None: - return - - if value not in bigframes.constants.ALL_BIGQUERY_LOCATIONS: - location = str(value) - possibility = min( - bigframes.constants.ALL_BIGQUERY_LOCATIONS, - key=lambda item: jellyfish.levenshtein_distance(location, item), - ) - warnings.warn( - UNKNOWN_LOCATION_MESSAGE.format(location=location, possibility=possibility), - # There are many layers before we get to (possibly) the user's code: - # -> bpd.options.bigquery.location = "us-central-1" - # -> location.setter - # -> _validate_location - stacklevel=3, - category=bigframes.exceptions.UnknownLocationWarning, - ) +def _get_validated_location(value: Optional[str]) -> Optional[str]: + + if value is None or value in bigframes.constants.ALL_BIGQUERY_LOCATIONS: + return value + + location = str(value) + + location_lowercase = location.lower() + if location_lowercase in bigframes.constants.BIGQUERY_REGIONS: + return location_lowercase + + location_uppercase = location.upper() + if location_uppercase in bigframes.constants.BIGQUERY_MULTIREGIONS: + return location_uppercase + + possibility = min( + bigframes.constants.ALL_BIGQUERY_LOCATIONS, + key=lambda item: jellyfish.levenshtein_distance(location, item), + ) + warnings.warn( + UNKNOWN_LOCATION_MESSAGE.format(location=location, possibility=possibility), + # There are many layers before we get to (possibly) the user's code: + # -> bpd.options.bigquery.location = "us-central-1" + # -> location.setter + # -> _get_validated_location + stacklevel=3, + category=bigframes.exceptions.UnknownLocationWarning, + ) + + return value def _validate_ordering_mode(value: str) -> bigframes.enums.OrderingMode: @@ -84,7 +94,7 @@ def __init__( ): self._credentials = credentials self._project = project - self._location = location + self._location = _get_validated_location(location) self._bq_connection = bq_connection self._use_regional_endpoints = use_regional_endpoints self._application_name = application_name @@ -101,6 +111,10 @@ def application_name(self) -> Optional[str]: The application name to amend to the user agent sent to Google APIs. The recommended format is ``"application-name/major.minor.patch_version"`` or ``"(gpn:PartnerName;)"`` for official Google partners. + + Returns: + None or str: + Application name as a string if exists; otherwise None. """ return self._application_name @@ -114,7 +128,12 @@ def application_name(self, value: Optional[str]): @property def credentials(self) -> Optional[google.auth.credentials.Credentials]: - """The OAuth2 credentials to use for this client.""" + """The OAuth2 credentials to use for this client. + + Returns: + None or google.auth.credentials.Credentials: + google.auth.credentials.Credentials if exists; otherwise None. + """ return self._credentials @credentials.setter @@ -128,6 +147,10 @@ def location(self) -> Optional[str]: """Default location for job, datasets, and tables. For more information, see https://cloud.google.com/bigquery/docs/locations BigQuery locations. + + Returns: + None or str: + Default location as a string; otherwise None. 
""" return self._location @@ -135,12 +158,16 @@ def location(self) -> Optional[str]: def location(self, value: Optional[str]): if self._session_started and self._location != value: raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="location")) - _validate_location(value) - self._location = value + self._location = _get_validated_location(value) @property def project(self) -> Optional[str]: - """Google Cloud project ID to use for billing and as the default project.""" + """Google Cloud project ID to use for billing and as the default project. + + Returns: + None or str: + Google Cloud project ID as a string; otherwise None. + """ return self._project @project.setter @@ -163,6 +190,10 @@ def bq_connection(self) -> Optional[str]: If this option isn't provided, or project or location aren't provided, session will use its default project/location/connection_id as default connection. + + Returns: + None or str: + Name of the BigQuery connection as a string; otherwise None. """ return self._bq_connection @@ -181,6 +212,12 @@ def skip_bq_connection_check(self) -> bool: connection (default or user-provided) does not exist, or it does not have necessary permissions set up to support BigQuery DataFrames operations, then a runtime error will be reported. + + Returns: + bool: + A boolean value, where True indicates a BigQuery connection is + not created or the connection does not have necessary + permissions set up; otherwise False. """ return self._skip_bq_connection_check @@ -196,13 +233,29 @@ def skip_bq_connection_check(self, value: bool): def use_regional_endpoints(self) -> bool: """Flag to connect to regional API endpoints. - .. deprecated:: 0.13.0 - Use of regional endpoints is a feature in Preview and - available only in selected regions and projects. + .. note:: + Use of regional endpoints is a feature in Preview and available only + in regions "europe-west3", "europe-west9", "europe-west8", + "me-central2", "us-east4" and "us-west1". - Requires that ``location`` is set. For example, to connect to - asia-northeast1-bigquery.googleapis.com, specify - ``location='asia-northeast1'`` and ``use_regional_endpoints=True``. + .. deprecated:: 0.13.0 + Use of locational endpoints is available only in selected projects. + + Requires that ``location`` is set. For supported regions, for example + ``europe-west3``, you need to specify ``location='europe-west3'`` and + ``use_regional_endpoints=True``, and then BigQuery DataFrames would + connect to the BigQuery endpoint ``bigquery.europe-west3.rep.googleapis.com``. + For not supported regions, for example ``asia-northeast1``, when you + specify ``location='asia-northeast1'`` and ``use_regional_endpoints=True``, + a different endpoint (called locational endpoint, now deprecated, used + to provide weaker promise on the request remaining within the location + during transit) ``europe-west3-bigquery.googleapis.com`` would be used. + + Returns: + bool: + A boolean value, where True indicates that regional endpoints + would be used for BigQuery and BigQuery storage APIs; otherwise + global endpoints would be used. """ return self._use_regional_endpoints @@ -235,6 +288,10 @@ def kms_key_name(self) -> Optional[str]: Cloud KMS CryptoKey Encrypter/Decrypter IAM role in the key's project. For more information, see https://cloud.google.com/bigquery/docs/customer-managed-encryption#assign_role Assign the Encrypter/Decrypter. + + Returns: + None or str: + Name of the customer managed encryption key as a string; otherwise None. 
""" return self._kms_key_name @@ -247,7 +304,12 @@ def kms_key_name(self, value: str): @property def ordering_mode(self) -> Literal["strict", "partial"]: - """Controls whether total row order is always maintained for DataFrame/Series.""" + """Controls whether total row order is always maintained for DataFrame/Series. + + Returns: + Literal: + A literal string value of either strict or partial ordering mode. + """ return self._ordering_mode.value @ordering_mode.setter diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py new file mode 100644 index 0000000000..c39502eade --- /dev/null +++ b/bigframes/_config/experiment_options.py @@ -0,0 +1,36 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + + +class ExperimentOptions: + """ + Encapsulates the configration for experiments + """ + + def __init__(self): + self._semantic_operators = False + + @property + def semantic_operators(self) -> bool: + return self._semantic_operators + + @semantic_operators.setter + def semantic_operators(self, value: bool): + if value is True: + warnings.warn( + "Semantic operators are still under experiments, and are subject to change in the future." + ) + self._semantic_operators = value diff --git a/bigframes/_config/sampling_options.py b/bigframes/_config/sampling_options.py index f4fa0928e1..ddb2a49713 100644 --- a/bigframes/_config/sampling_options.py +++ b/bigframes/_config/sampling_options.py @@ -33,14 +33,44 @@ class SamplingOptions: random_state: Optional[int] = None def with_max_download_size(self, max_rows: Optional[int]) -> SamplingOptions: + """Configures the maximum download size for data sampling in MB + + Args: + max_rows (None or int): + An int value for the maximum row size. + + Returns: + bigframes._config.sampling_options.SamplingOptions: + The configuration for data sampling. + """ return SamplingOptions( max_rows, self.enable_downsampling, self.sampling_method, self.random_state ) def with_method(self, method: Literal["head", "uniform"]) -> SamplingOptions: + """Configures the downsampling algorithms to be chosen from + + Args: + method (None or Literal): + A literal string value of either head or uniform data sampling method. + + Returns: + bigframes._config.sampling_options.SamplingOptions: + The configuration for data sampling. + """ return SamplingOptions(self.max_download_size, True, method, self.random_state) def with_random_state(self, state: Optional[int]) -> SamplingOptions: + """Configures the seed for the uniform downsampling algorithm + + Args: + state (None or int): + An int value for the data sampling random state + + Returns: + bigframes._config.sampling_options.SamplingOptions: + The configuration for data sampling. 
+ """ return SamplingOptions( self.max_download_size, self.enable_downsampling, @@ -49,6 +79,12 @@ def with_random_state(self, state: Optional[int]) -> SamplingOptions: ) def with_disabled(self) -> SamplingOptions: + """Configures whether to disable downsampling + + Returns: + bigframes._config.sampling_options.SamplingOptions: + The configuration for data sampling. + """ return SamplingOptions( self.max_download_size, False, self.sampling_method, self.random_state ) diff --git a/bigframes/constants.py b/bigframes/constants.py index 4d5b6b8eb3..13636a4484 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -22,9 +22,8 @@ DEFAULT_EXPIRATION = datetime.timedelta(days=7) # https://cloud.google.com/bigquery/docs/locations -ALL_BIGQUERY_LOCATIONS = frozenset( +BIGQUERY_REGIONS = frozenset( { - # regions "us-east5", "us-south1", "us-central1", @@ -68,18 +67,23 @@ "me-central1", "me-west1", "africa-south1", - # multi-regions + } +) +BIGQUERY_MULTIREGIONS = frozenset( + { "US", "EU", } ) +ALL_BIGQUERY_LOCATIONS = frozenset(BIGQUERY_REGIONS.union(BIGQUERY_MULTIREGIONS)) # https://cloud.google.com/storage/docs/regional-endpoints REP_ENABLED_BIGQUERY_LOCATIONS = frozenset( { - "me-central2", - "europe-west9", "europe-west3", + "europe-west9", + "europe-west8", + "me-central2", "us-east4", "us-west1", } diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index acab99f249..485a9d79a7 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -158,10 +158,6 @@ def session(self) -> Session: def schema(self) -> schemata.ArraySchema: return self.node.schema - @functools.cached_property - def _compiled_schema(self) -> schemata.ArraySchema: - return bigframes.core.compile.test_only_ibis_inferred_schema(self.node) - @property def explicitly_ordered(self) -> bool: # see BigFrameNode.explicitly_ordered @@ -229,6 +225,23 @@ def order_by(self, by: Sequence[OrderingExpression]) -> ArrayValue: def reversed(self) -> ArrayValue: return ArrayValue(nodes.ReversedNode(child=self.node)) + def slice( + self, start: Optional[int], stop: Optional[int], step: Optional[int] + ) -> ArrayValue: + if self.node.order_ambiguous and not (self.session._strictly_ordered): + warnings.warn( + "Window ordering may be ambiguous, this can cause unstable results.", + bigframes.exceptions.AmbiguousWindowWarning, + ) + return ArrayValue( + nodes.SliceNode( + self.node, + start=start, + stop=stop, + step=step if (step is not None) else 1, + ) + ) + def promote_offsets(self) -> Tuple[ArrayValue, str]: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. @@ -394,20 +407,6 @@ def project_window_op( output_name, ) - def _reproject_to_table(self) -> ArrayValue: - """ - Internal operators that projects the internal representation into a - new ibis table expression where each value column is a direct - reference to a column in that table expression. Needed after - some operations such as window operations that cannot be used - recursively in projections. 
- """ - return ArrayValue( - nodes.ReprojectOpNode( - child=self.node, - ) - ) - def relational_join( self, other: ArrayValue, diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 2c4991b629..785691edd6 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -196,8 +196,7 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: else: output_column_ids.append(column) - # Force reproject since used `skip_project_unsafe` perviously - block = block.select_columns(output_column_ids)._force_reproject() + block = block.select_columns(output_column_ids) return block.with_column_labels(original_labels) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 9e245399cd..b0a8903e19 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1390,7 +1390,7 @@ def explode( expr, column_labels=self.column_labels, index_columns=self.index_columns, - index_labels=self.column_labels.names, + index_labels=self._index_labels, ) def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]: @@ -1465,84 +1465,17 @@ def slice( self, start: typing.Optional[int] = None, stop: typing.Optional[int] = None, - step: typing.Optional[int] = None, - ) -> bigframes.core.blocks.Block: - if step is None: - step = 1 + step: int = 1, + ) -> Block: if step == 0: - raise ValueError("slice step cannot be zero") - if step < 0: - reverse_start = (-start - 1) if start else 0 - reverse_stop = (-stop - 1) if stop else None - reverse_step = -step - return self.reversed()._forward_slice( - reverse_start, reverse_stop, reverse_step - ) - return self._forward_slice(start or 0, stop, step) - - def _forward_slice(self, start: int = 0, stop=None, step: int = 1): - """Performs slice but only for positive step size.""" - if step <= 0: - raise ValueError("forward_slice only supports positive step size") - - use_postive_offsets = ( - (start > 0) - or ((stop is not None) and (stop >= 0)) - or ((step > 1) and (start >= 0)) - ) - use_negative_offsets = ( - (start < 0) or (stop and (stop < 0)) or ((step > 1) and (start < 0)) + raise ValueError("Slice step size must be non-zero") + return Block( + self.expr.slice(start, stop, step), + index_columns=self.index_columns, + column_labels=self.column_labels, + index_labels=self._index_labels, ) - block = self - - # only generate offsets that are used - positive_offsets = None - negative_offsets = None - - if use_postive_offsets: - block, positive_offsets = self.promote_offsets() - if use_negative_offsets: - block, negative_offsets = block.reversed().promote_offsets() - block = block.reversed() - - conditions = [] - if start != 0: - if start > 0: - assert positive_offsets - conditions.append(ops.ge_op.as_expr(positive_offsets, ex.const(start))) - else: - assert negative_offsets - conditions.append( - ops.le_op.as_expr(negative_offsets, ex.const(-start - 1)) - ) - if stop is not None: - if stop >= 0: - assert positive_offsets - conditions.append(ops.lt_op.as_expr(positive_offsets, ex.const(stop))) - else: - assert negative_offsets - conditions.append( - ops.gt_op.as_expr(negative_offsets, ex.const(-stop - 1)) - ) - if step > 1: - if start >= 0: - assert positive_offsets - start_diff = ops.sub_op.as_expr(positive_offsets, ex.const(start)) - else: - assert negative_offsets - start_diff = ops.sub_op.as_expr(negative_offsets, ex.const(-start + 1)) - step_cond = ops.eq_op.as_expr( - ops.mod_op.as_expr(start_diff, ex.const(step)), ex.const(0) - ) - 
conditions.append(step_cond) - - for cond in conditions: - block, cond_id = block.project_expr(cond) - block = block.filter_by_id(cond_id) - - return block.select_columns(self.value_columns) - # Using cache to optimize for Jupyter Notebook's behavior where both '__repr__' # and '__repr_html__' are called in a single display action, reducing redundant # queries. @@ -1557,10 +1490,11 @@ def retrieve_repr_request_results( Returns a tuple of the dataframe and the overall number of rows of the query. """ + # head caches full underlying expression, so row_count will be free after head_result = self.session._executor.head(self.expr, max_results) count = self.session._executor.get_row_count(self.expr) - arrow = self.session._executor.execute(self.expr).to_arrow_table() + arrow = head_result.to_arrow_table() df = io_pandas.arrow_to_pandas(arrow, schema=self.expr.schema) self._copy_index_to_pandas(df) return df, count, head_result.query_job @@ -2432,15 +2366,6 @@ def join( # Always sort mult-index join return join_multi_indexed(self, other, how=how, sort=sort) - def _force_reproject(self) -> Block: - """Forces a reprojection of the underlying tables expression. Used to force predicate/order application before subsequent operations.""" - return Block( - self._expr._reproject_to_table(), - index_columns=self.index_columns, - column_labels=self.column_labels, - index_labels=self.index.names, - ) - def is_monotonic_increasing( self, column_id: typing.Union[str, Sequence[str]] ) -> bool: diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 0917097c70..fd1514d7b7 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -335,10 +335,6 @@ def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True): ) return result if ordered else result.to_unordered() - @_compile_node.register - def compile_reproject(self, node: nodes.ReprojectOpNode, ordered: bool = True): - return self.compile_node(node.child, ordered)._reproject_to_table() - @_compile_node.register def compile_explode(self, node: nodes.ExplodeNode, ordered: bool = True): return self.compile_node(node.child, ordered).explode(node.column_ids) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 5cb0e65729..2d351cf82d 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -88,6 +88,12 @@ def __getitem__( keys = list(key) else: keys = [key] + + bad_keys = [key for key in keys if key not in self._block.column_labels] + + if len(bad_keys) > 0: + raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") + columns = [ col_id for col_id, label in self._col_id_labels.items() if label in keys ] diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index e65040686e..1d01936509 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -20,7 +20,7 @@ import functools import itertools import typing -from typing import Callable, Iterable, Sequence, Tuple +from typing import Callable, Iterable, Optional, Sequence, Tuple import google.cloud.bigquery as bq @@ -51,8 +51,8 @@ class Field: dtype: bigframes.dtypes.Dtype -@dataclass(frozen=True) -class BigFrameNode: +@dataclass(eq=False, frozen=True) +class BigFrameNode(abc.ABC): """ Immutable node for representing 2D typed array as a tree of operators. 
@@ -95,12 +95,30 @@ def session(self): return sessions[0] return None + def _as_tuple(self) -> Tuple: + """Get all fields as tuple.""" + return tuple(getattr(self, field.name) for field in fields(self)) + + def __hash__(self) -> int: + # Custom hash that uses cache to avoid costly recomputation + return self._cached_hash + + def __eq__(self, other) -> bool: + # Custom eq that tries to short-circuit full structural comparison + if not isinstance(other, self.__class__): + return False + if self is other: + return True + if hash(self) != hash(other): + return False + return self._as_tuple() == other._as_tuple() + # BigFrameNode trees can be very deep so its important avoid recalculating the hash from scratch # Each subclass of BigFrameNode should use this property to implement __hash__ # The default dataclass-generated __hash__ method is not cached @functools.cached_property - def _node_hash(self): - return hash(tuple(hash(getattr(self, field.name)) for field in fields(self))) + def _cached_hash(self): + return hash(self._as_tuple()) @property def roots(self) -> typing.Set[BigFrameNode]: @@ -109,10 +127,10 @@ def roots(self) -> typing.Set[BigFrameNode]: ) return set(roots) - # TODO: For deep trees, this can create a lot of overhead, maybe use zero-copy persistent datastructure? + # TODO: Store some local data lazily for select, aggregate nodes. @property @abc.abstractmethod - def fields(self) -> Tuple[Field, ...]: + def fields(self) -> Iterable[Field]: ... @property @@ -226,7 +244,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return self.transform_children(lambda x: x.prune(used_cols)) -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class UnaryNode(BigFrameNode): child: BigFrameNode @@ -234,8 +252,8 @@ class UnaryNode(BigFrameNode): def child_nodes(self) -> typing.Sequence[BigFrameNode]: return (self.child,) - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: + @property + def fields(self) -> Iterable[Field]: return self.child.fields @property @@ -252,7 +270,38 @@ def order_ambiguous(self) -> bool: return self.child.order_ambiguous -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) +class SliceNode(UnaryNode): + """Logical slice node conditionally becomes limit or filter over row numbers.""" + + start: Optional[int] + stop: Optional[int] + step: int = 1 + + @property + def row_preserving(self) -> bool: + """Whether this node preserves input rows.""" + return False + + @property + def non_local(self) -> bool: + """ + Whether this node combines information across multiple rows instead of processing rows independently. + Used as an approximation for whether the expression may require shuffling to execute (and therefore be expensive). + """ + return True + + # these are overestimates, more accurate numbers available by converting to concrete limit or analytic+filter ops + @property + def variables_introduced(self) -> int: + return 2 + + @property + def relation_ops_created(self) -> int: + return 2 + + +@dataclass(frozen=True, eq=False) class JoinNode(BigFrameNode): left_child: BigFrameNode right_child: BigFrameNode @@ -285,12 +334,9 @@ def explicitly_ordered(self) -> bool: # Do not consider user pre-join ordering intent - they need to re-order post-join in unordered mode. 
return False - def __hash__(self): - return self._node_hash - - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: - return tuple(itertools.chain(self.left_child.fields, self.right_child.fields)) + @property + def fields(self) -> Iterable[Field]: + return itertools.chain(self.left_child.fields, self.right_child.fields) @functools.cached_property def variables_introduced(self) -> int: @@ -320,7 +366,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return self.transform_children(lambda x: x.prune(new_used)) -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class ConcatNode(BigFrameNode): # TODO: Explcitly map column ids from each child children: Tuple[BigFrameNode, ...] @@ -345,13 +391,10 @@ def explicitly_ordered(self) -> bool: # Consider concat as an ordered operations (even though input frames may not be ordered) return True - def __hash__(self): - return self._node_hash - - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: + @property + def fields(self) -> Iterable[Field]: # TODO: Output names should probably be aligned beforehand or be part of concat definition - return tuple( + return ( Field(bfet_ids.ColumnId(f"column_{i}"), field.dtype) for i, field in enumerate(self.children[0].fields) ) @@ -371,16 +414,13 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return self -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class FromRangeNode(BigFrameNode): # TODO: Enforce single-row, single column constraint start: BigFrameNode end: BigFrameNode step: int - def __hash__(self): - return self._node_hash - @property def roots(self) -> typing.Set[BigFrameNode]: return {self} @@ -398,8 +438,10 @@ def explicitly_ordered(self) -> bool: return True @functools.cached_property - def fields(self) -> Tuple[Field, ...]: - return (Field(bfet_ids.ColumnId("labels"), self.start.fields[0].dtype),) + def fields(self) -> Iterable[Field]: + return ( + Field(bfet_ids.ColumnId("labels"), next(iter(self.start.fields)).dtype), + ) @functools.cached_property def variables_introduced(self) -> int: @@ -419,7 +461,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: # Input Nodex # TODO: Most leaf nodes produce fixed column names based on the datasource # They should support renaming -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class LeafNode(BigFrameNode): @property def roots(self) -> typing.Set[BigFrameNode]: @@ -451,7 +493,7 @@ class ScanList: items: typing.Tuple[ScanItem, ...] -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class ReadLocalNode(LeafNode): feather_bytes: bytes data_schema: schemata.ArraySchema @@ -460,14 +502,11 @@ class ReadLocalNode(LeafNode): scan_list: ScanList session: typing.Optional[bigframes.session.Session] = None - def __hash__(self): - return self._node_hash - - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: - return tuple(Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) + @property + def fields(self) -> Iterable[Field]: + return (Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) - @functools.cached_property + @property def variables_introduced(self) -> int: """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" return len(self.scan_list.items) + 1 @@ -508,6 +547,7 @@ class GbqTable: table_id: str = field() physical_schema: Tuple[bq.SchemaField, ...] 
= field() n_rows: int = field() + is_physically_stored: bool = field() cluster_cols: typing.Optional[Tuple[str, ...]] @staticmethod @@ -523,6 +563,7 @@ def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqTable: table_id=table.table_id, physical_schema=schema, n_rows=table.num_rows, + is_physically_stored=(table.table_type in ["TABLE", "MATERIALIZED_VIEW"]), cluster_cols=None if table.clustering_fields is None else tuple(table.clustering_fields), @@ -545,7 +586,7 @@ class BigqueryDataSource: ## Put ordering in here or just add order_by node above? -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class ReadTableNode(LeafNode): source: BigqueryDataSource # Subset of physical schema column @@ -568,12 +609,9 @@ def __post_init__(self): def session(self): return self.table_session - def __hash__(self): - return self._node_hash - - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: - return tuple(Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) + @property + def fields(self) -> Iterable[Field]: + return (Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) @property def relation_ops_created(self) -> int: @@ -603,7 +641,7 @@ def variables_introduced(self) -> int: @property def row_count(self) -> typing.Optional[int]: - if self.source.sql_predicate is None: + if self.source.sql_predicate is None and self.source.table.is_physically_stored: return self.source.table.n_rows return None @@ -614,15 +652,12 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return ReadTableNode(self.source, new_scan_list, self.table_session) -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class CachedTableNode(ReadTableNode): # The original BFET subtree that was cached # note: this isn't a "child" node. original_node: BigFrameNode = field() - def __hash__(self): - return self._node_hash - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: new_scan_list = ScanList( tuple(item for item in self.scan_list.items if item.id in used_cols) @@ -633,20 +668,19 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: # Unary nodes -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class PromoteOffsetsNode(UnaryNode): col_id: bigframes.core.identifiers.ColumnId - def __hash__(self): - return self._node_hash - @property def non_local(self) -> bool: return True @property - def fields(self) -> Tuple[Field, ...]: - return (*self.child.fields, Field(self.col_id, bigframes.dtypes.INT_DTYPE)) + def fields(self) -> Iterable[Field]: + return itertools.chain( + self.child.fields, [Field(self.col_id, bigframes.dtypes.INT_DTYPE)] + ) @property def relation_ops_created(self) -> int: @@ -664,7 +698,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return self.transform_children(lambda x: x.prune(new_used)) -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class FilterNode(UnaryNode): predicate: ex.Expression @@ -672,9 +706,6 @@ class FilterNode(UnaryNode): def row_preserving(self) -> bool: return False - def __hash__(self): - return self._node_hash - @property def variables_introduced(self) -> int: return 1 @@ -685,13 +716,10 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return FilterNode(pruned_child, self.predicate) -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class OrderByNode(UnaryNode): by: Tuple[OrderingExpression, ...] 
- def __hash__(self): - return self._node_hash - @property def variables_introduced(self) -> int: return 0 @@ -714,14 +742,11 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return OrderByNode(pruned_child, self.by) -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class ReversedNode(UnaryNode): # useless field to make sure has distinct hash reversed: bool = True - def __hash__(self): - return self._node_hash - @property def variables_introduced(self) -> int: return 0 @@ -732,17 +757,14 @@ def relation_ops_created(self) -> int: return 0 -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class SelectionNode(UnaryNode): input_output_pairs: typing.Tuple[ typing.Tuple[ex.DerefOp, bigframes.core.identifiers.ColumnId], ... ] - def __hash__(self): - return self._node_hash - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: + def fields(self) -> Iterable[Field]: return tuple( Field(output, self.child.get_type(input.id)) for input, output in self.input_output_pairs @@ -770,7 +792,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return SelectionNode(pruned_child, pruned_selections) -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class ProjectionNode(UnaryNode): """Assigns new variables (without modifying existing ones)""" @@ -786,17 +808,17 @@ def __post_init__(self): # Cannot assign to existing variables - append only! assert all(name not in self.child.schema.names for _, name in self.assignments) - def __hash__(self): - return self._node_hash - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: + def added_fields(self) -> Tuple[Field, ...]: input_types = self.child._dtype_lookup - new_fields = ( + return tuple( Field(id, bigframes.dtypes.dtype_for_etype(ex.output_type(input_types))) for ex, id in self.assignments ) - return (*self.child.fields, *new_fields) + + @property + def fields(self) -> Iterable[Field]: + return itertools.chain(self.child.fields, self.added_fields) @property def variables_introduced(self) -> int: @@ -817,7 +839,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: # TODO: Merge RowCount into Aggregate Node? # Row count can be compute from table metadata sometimes, so it is a bit special. -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class RowCountNode(UnaryNode): @property def row_preserving(self) -> bool: @@ -827,8 +849,8 @@ def row_preserving(self) -> bool: def non_local(self) -> bool: return True - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: + @property + def fields(self) -> Iterable[Field]: return (Field(bfet_ids.ColumnId("count"), bigframes.dtypes.INT_DTYPE),) @property @@ -840,7 +862,7 @@ def defines_namespace(self) -> bool: return True -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class AggregateNode(UnaryNode): aggregations: typing.Tuple[ typing.Tuple[ex.Aggregation, bigframes.core.identifiers.ColumnId], ... 
@@ -852,15 +874,12 @@ class AggregateNode(UnaryNode): def row_preserving(self) -> bool: return False - def __hash__(self): - return self._node_hash - @property def non_local(self) -> bool: return True @functools.cached_property - def fields(self) -> Tuple[Field, ...]: + def fields(self) -> Iterable[Field]: by_items = ( Field(ref.id, self.child.get_type(ref.id)) for ref in self.by_column_ids ) @@ -873,7 +892,7 @@ def fields(self) -> Tuple[Field, ...]: ) for agg, id in self.aggregations ) - return (*by_items, *agg_items) + return tuple(itertools.chain(by_items, agg_items)) @property def variables_introduced(self) -> int: @@ -902,7 +921,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return AggregateNode(pruned_child, pruned_aggs, self.by_column_ids, self.dropna) -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class WindowOpNode(UnaryNode): column_name: ex.DerefOp op: agg_ops.UnaryWindowOp @@ -911,18 +930,13 @@ class WindowOpNode(UnaryNode): never_skip_nulls: bool = False skip_reproject_unsafe: bool = False - def __hash__(self): - return self._node_hash - @property def non_local(self) -> bool: return True - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: - input_type = self.child.get_type(self.column_name.id) - new_item_dtype = self.op.output_type(input_type) - return (*self.child.fields, Field(self.output_name, new_item_dtype)) + @property + def fields(self) -> Iterable[Field]: + return itertools.chain(self.child.fields, [self.added_field]) @property def variables_introduced(self) -> int: @@ -933,6 +947,12 @@ def relation_ops_created(self) -> int: # Assume that if not reprojecting, that there is a sequence of window operations sharing the same window return 0 if self.skip_reproject_unsafe else 4 + @functools.cached_property + def added_field(self) -> Field: + input_type = self.child.get_type(self.column_name.id) + new_item_dtype = self.op.output_type(input_type) + return Field(self.output_name, new_item_dtype) + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: if self.output_name not in used_cols: return self.child @@ -942,23 +962,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return self.transform_children(lambda x: x.prune(consumed_ids)) -# TODO: Remove this op -@dataclass(frozen=True) -class ReprojectOpNode(UnaryNode): - def __hash__(self): - return self._node_hash - - @property - def variables_introduced(self) -> int: - return 0 - - @property - def relation_ops_created(self) -> int: - # This op is not a real transformation, just a hint to the sql generator - return 0 - - -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class RandomSampleNode(UnaryNode): fraction: float @@ -970,16 +974,13 @@ def deterministic(self) -> bool: def row_preserving(self) -> bool: return False - def __hash__(self): - return self._node_hash - @property def variables_introduced(self) -> int: return 1 # TODO: Explode should create a new column instead of overriding the existing one -@dataclass(frozen=True) +@dataclass(frozen=True, eq=False) class ExplodeNode(UnaryNode): column_ids: typing.Tuple[ex.DerefOp, ...] 
@@ -987,12 +988,9 @@ class ExplodeNode(UnaryNode): def row_preserving(self) -> bool: return False - def __hash__(self): - return self._node_hash - - @functools.cached_property - def fields(self) -> Tuple[Field, ...]: - return tuple( + @property + def fields(self) -> Iterable[Field]: + return ( Field( field.id, bigframes.dtypes.arrow_dtype_to_bigframes_dtype( diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 095f537c21..d4e530fff3 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -16,13 +16,15 @@ import dataclasses import functools import itertools -from typing import Mapping, Optional, Sequence, Tuple +from typing import cast, Mapping, Optional, Sequence, Tuple import bigframes.core.expression as scalar_exprs +import bigframes.core.guid as guids import bigframes.core.identifiers as ids import bigframes.core.join_def as join_defs import bigframes.core.nodes as nodes import bigframes.core.ordering as order +import bigframes.core.tree_properties as traversals import bigframes.operations as ops Selection = Tuple[Tuple[scalar_exprs.Expression, ids.ColumnId], ...] @@ -381,3 +383,172 @@ def common_selection_root( if r_node in l_nodes: return r_node return None + + +def replace_slice_ops(root: nodes.BigFrameNode) -> nodes.BigFrameNode: + # TODO: we want to pull up some slices into limit op if near root. + if isinstance(root, nodes.SliceNode): + root = root.transform_children(replace_slice_ops) + return convert_slice_to_filter(cast(nodes.SliceNode, root)) + else: + return root.transform_children(replace_slice_ops) + + +def get_simplified_slice(node: nodes.SliceNode): + """Attempts to simplify the slice.""" + row_count = traversals.row_count(node) + start, stop, step = node.start, node.stop, node.step + + if start is None: + start = 0 if step > 0 else -1 + if row_count and step > 0: + if start and start < 0: + start = row_count + start + if stop and stop < 0: + stop = row_count + stop + return start, stop, step + + +def convert_slice_to_filter(node: nodes.SliceNode): + start, stop, step = get_simplified_slice(node) + + # no-op (eg. df[::1]) + if ( + ((start == 0) or (start is None)) + and ((stop is None) or (stop == -1)) + and (step == 1) + ): + return node.child + # No filtering, just reverse (eg. 
df[::-1]) + if ((start is None) or (start == -1)) and (not stop) and (step == -1): + return nodes.ReversedNode(node.child) + # if start/stop/step are all non-negative, and do a simple predicate on forward offsets + if ((start is None) or (start >= 0)) and ((stop is None) or (stop >= 0)): + node_w_offset = add_offsets(node.child) + predicate = convert_simple_slice( + scalar_exprs.DerefOp(node_w_offset.col_id), start or 0, stop, step + ) + filtered = nodes.FilterNode(node_w_offset, predicate) + return drop_cols(filtered, (node_w_offset.col_id,)) + + # fallback cases, generate both forward and backward offsets + if step < 0: + forward_offsets = add_offsets(node.child) + reversed_offsets = add_offsets(nodes.ReversedNode(forward_offsets)) + dual_indexed = reversed_offsets + else: + reversed_offsets = add_offsets(nodes.ReversedNode(node.child)) + forward_offsets = add_offsets(nodes.ReversedNode(reversed_offsets)) + dual_indexed = forward_offsets + predicate = convert_complex_slice( + scalar_exprs.DerefOp(forward_offsets.col_id), + scalar_exprs.DerefOp(reversed_offsets.col_id), + start, + stop, + step, + ) + filtered = nodes.FilterNode(dual_indexed, predicate) + return drop_cols(filtered, (forward_offsets.col_id, reversed_offsets.col_id)) + + +def add_offsets(node: nodes.BigFrameNode) -> nodes.PromoteOffsetsNode: + # Allow providing custom id generator? + offsets_id = ids.ColumnId(guids.generate_guid()) + return nodes.PromoteOffsetsNode(node, offsets_id) + + +def drop_cols( + node: nodes.BigFrameNode, drop_cols: Tuple[ids.ColumnId, ...] +) -> nodes.SelectionNode: + # adding a whole node that redefines the schema is a lot of overhead, should do something more efficient + selections = tuple( + (scalar_exprs.DerefOp(id), id) for id in node.ids if id not in drop_cols + ) + return nodes.SelectionNode(node, selections) + + +def convert_simple_slice( + offsets: scalar_exprs.Expression, + start: int = 0, + stop: Optional[int] = None, + step: int = 1, +) -> scalar_exprs.Expression: + """Performs slice but only for positive step size.""" + assert start >= 0 + assert (stop is None) or (stop >= 0) + + conditions = [] + if start > 0: + conditions.append(ops.ge_op.as_expr(offsets, scalar_exprs.const(start))) + if (stop is not None) and (stop >= 0): + conditions.append(ops.lt_op.as_expr(offsets, scalar_exprs.const(stop))) + if step > 1: + start_diff = ops.sub_op.as_expr(offsets, scalar_exprs.const(start)) + step_cond = ops.eq_op.as_expr( + ops.mod_op.as_expr(start_diff, scalar_exprs.const(step)), + scalar_exprs.const(0), + ) + conditions.append(step_cond) + + return merge_predicates(conditions) or scalar_exprs.const(True) + + +def convert_complex_slice( + forward_offsets: scalar_exprs.Expression, + reverse_offsets: scalar_exprs.Expression, + start: int, + stop: Optional[int], + step: int = 1, +) -> scalar_exprs.Expression: + conditions = [] + assert step != 0 + if start or ((start is not None) and step < 0): + if start > 0 and step > 0: + start_cond = ops.ge_op.as_expr(forward_offsets, scalar_exprs.const(start)) + elif start > 0 and step < 0: + start_cond = ops.le_op.as_expr(forward_offsets, scalar_exprs.const(start)) + elif start < 0 and step > 0: + start_cond = ops.le_op.as_expr( + reverse_offsets, scalar_exprs.const(-start - 1) + ) + else: + assert start < 0 and step < 0 + start_cond = ops.ge_op.as_expr( + reverse_offsets, scalar_exprs.const(-start - 1) + ) + conditions.append(start_cond) + if stop is not None: + if stop >= 0 and step > 0: + stop_cond = ops.lt_op.as_expr(forward_offsets, 
scalar_exprs.const(stop)) + elif stop >= 0 and step < 0: + stop_cond = ops.gt_op.as_expr(forward_offsets, scalar_exprs.const(stop)) + elif stop < 0 and step > 0: + stop_cond = ops.gt_op.as_expr( + reverse_offsets, scalar_exprs.const(-stop - 1) + ) + else: + assert (stop < 0) and (step < 0) + stop_cond = ops.lt_op.as_expr( + reverse_offsets, scalar_exprs.const(-stop - 1) + ) + conditions.append(stop_cond) + if step != 1: + if step > 1 and start >= 0: + start_diff = ops.sub_op.as_expr(forward_offsets, scalar_exprs.const(start)) + elif step > 1 and start < 0: + start_diff = ops.sub_op.as_expr( + reverse_offsets, scalar_exprs.const(-start + 1) + ) + elif step < 0 and start >= 0: + start_diff = ops.add_op.as_expr(forward_offsets, scalar_exprs.const(start)) + else: + assert step < 0 and start < 0 + start_diff = ops.add_op.as_expr( + reverse_offsets, scalar_exprs.const(-start + 1) + ) + step_cond = ops.eq_op.as_expr( + ops.mod_op.as_expr(start_diff, scalar_exprs.const(step)), + scalar_exprs.const(0), + ) + conditions.append(step_cond) + return merge_predicates(conditions) or scalar_exprs.const(True) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d9f7cb9f42..0cfa5a2154 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -74,6 +74,7 @@ import bigframes.operations.aggregations import bigframes.operations.aggregations as agg_ops import bigframes.operations.plotting as plotting +import bigframes.operations.semantics import bigframes.operations.structs import bigframes.series import bigframes.series as bf_series @@ -689,7 +690,6 @@ def _repr_html_(self) -> str: if opts.repr_mode == "deferred": return formatter.repr_query_job(self._compute_dry_run()) - self._cached() # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? @@ -3722,7 +3722,9 @@ def _slice( stop: typing.Optional[int] = None, step: typing.Optional[int] = None, ) -> DataFrame: - block = self._block.slice(start=start, stop=stop, step=step) + block = self._block.slice( + start=start, stop=stop, step=step if (step is not None) else 1 + ) return DataFrame(block) def __array_ufunc__( @@ -3875,3 +3877,7 @@ def _throw_if_null_index(self, opname: str): raise bigframes.exceptions.NullIndexError( f"DataFrame cannot perform {opname} as it has no index. Set an index using set_index." ) + + @property + def semantics(self): + return bigframes.operations.semantics.Semantics(self) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index c12da01b54..3920da6c71 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -83,6 +83,13 @@ _ML_EMBED_TEXT_STATUS = "ml_embed_text_status" _ML_GENERATE_EMBEDDING_STATUS = "ml_generate_embedding_status" +_MODEL_NOT_SUPPORTED_WARNING = ( + "Model name '{model_name}' is not supported. " + "We are currently aware of the following models: {known_models}. " + "However, model names can change, and the supported models may be outdated. " + "You should use this model name only if you are sure that it is supported in BigQuery." +) + @typing_extensions.deprecated( "PaLM2TextGenerator is going to be deprecated. Use GeminiTextGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. ", @@ -154,8 +161,11 @@ def _create_bqml_model(self): ) if self.model_name not in _TEXT_GENERATOR_ENDPOINTS: - raise ValueError( - f"Model name {self.model_name} is not supported. We only support {', '.join(_TEXT_GENERATOR_ENDPOINTS)}." 
+ warnings.warn( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_TEXT_GENERATOR_ENDPOINTS), + ) ) options = { @@ -484,8 +494,11 @@ def _create_bqml_model(self): ) if self.model_name not in _PALM2_EMBEDDING_GENERATOR_ENDPOINTS: - raise ValueError( - f"Model name {self.model_name} is not supported. We only support {', '.join(_PALM2_EMBEDDING_GENERATOR_ENDPOINTS)}." + warnings.warn( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_PALM2_EMBEDDING_GENERATOR_ENDPOINTS), + ) ) endpoint = ( @@ -644,8 +657,11 @@ def _create_bqml_model(self): ) if self.model_name not in _TEXT_EMBEDDING_ENDPOINTS: - raise ValueError( - f"Model name {self.model_name} is not supported. We only support {', '.join(_TEXT_EMBEDDING_ENDPOINTS)}." + warnings.warn( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_TEXT_EMBEDDING_ENDPOINTS), + ) ) options = { @@ -801,8 +817,11 @@ def _create_bqml_model(self): ) if self.model_name not in _GEMINI_ENDPOINTS: - raise ValueError( - f"Model name {self.model_name} is not supported. We only support {', '.join(_GEMINI_ENDPOINTS)}." + warnings.warn( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_GEMINI_ENDPOINTS), + ) ) options = {"endpoint": self.model_name} @@ -1118,8 +1137,11 @@ def _create_bqml_model(self): ) if self.model_name not in _CLAUDE_3_ENDPOINTS: - raise ValueError( - f"Model name {self.model_name} is not supported. We only support {', '.join(_CLAUDE_3_ENDPOINTS)}." + warnings.warn( + _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=", ".join(_CLAUDE_3_ENDPOINTS), + ) ) options = { diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py new file mode 100644 index 0000000000..9ff7ea38b2 --- /dev/null +++ b/bigframes/operations/semantics.py @@ -0,0 +1,762 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import re +import typing +from typing import List, Optional + +import bigframes +import bigframes.core.guid +import bigframes.dtypes as dtypes + + +class Semantics: + def __init__(self, df) -> None: + if not bigframes.options.experiments.semantic_operators: + raise NotImplementedError() + + self._df = df + + def agg( + self, + instruction: str, + model, + cluster_column: typing.Optional[str] = None, + max_agg_rows: int = 10, + ): + """ + Performs an aggregation over all rows of the table. + + This method recursively aggregates the input data to produce partial answers + in parallel, until a single answer remains. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> df = bpd.DataFrame( + ... { + ... "Movies": [ + ... "Titanic", + ... "The Wolf of Wall Street", + ... 
"Inception", + ... ], + ... "Year": [1997, 2013, 2010], + ... }) + >>> df.semantics.agg( + ... "Find the first name shared by all actors in {Movies}. One word answer.", + ... model=model, + ... ) + 0 Leonardo + + Name: Movies, dtype: string + + Args: + instruction (str): + An instruction on how to map the data. This value must contain + column references by name enclosed in braces. + For example, to reference a column named "movies", use "{movies}" in the + instruction, like: "Find actor names shared by all {movies}." + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by the Bigframes ML package. + + cluster_column (Optional[str], default None): + If set, aggregates each cluster before performing aggregations across + clusters. Clustering based on semantic similarity can improve accuracy + of the sementic aggregations. + + max_agg_rows (int, default 10): + The maxinum number of rows to be aggregated at a time. + + Returns: + bigframes.dataframe.DataFrame: A new DataFrame with the aggregated answers. + + Raises: + NotImplementedError: when the semantic operator experiment is off. + ValueError: when the instruction refers to a non-existing column, or when + more than one columns are referred to. + """ + self._validate_model(model) + + columns = self._parse_columns(instruction) + for column in columns: + if column not in self._df.columns: + raise ValueError(f"Column {column} not found.") + if len(columns) > 1: + raise NotImplementedError( + "Semantic aggregations are limited to a single column." + ) + column = columns[0] + + if max_agg_rows <= 1: + raise ValueError( + f"Invalid value for `max_agg_rows`: {max_agg_rows}." + "It must be greater than 1." + ) + + import bigframes.bigquery as bbq + import bigframes.dataframe + import bigframes.series + + df: bigframes.dataframe.DataFrame = self._df.copy() + user_instruction = self._format_instruction(instruction, columns) + + num_cluster = 1 + if cluster_column is not None: + if cluster_column not in df.columns: + raise ValueError(f"Cluster column `{cluster_column}` not found.") + + if df[cluster_column].dtype != dtypes.INT_DTYPE: + raise TypeError( + "Cluster column must be an integer type, not " + f"{type(df[cluster_column])}" + ) + + num_cluster = len(df[cluster_column].unique()) + df = df.sort_values(cluster_column) + else: + cluster_column = bigframes.core.guid.generate_guid("pid") + df[cluster_column] = 0 + + aggregation_group_id = bigframes.core.guid.generate_guid("agg") + group_row_index = bigframes.core.guid.generate_guid("gid") + llm_prompt = bigframes.core.guid.generate_guid("prompt") + df = ( + df.reset_index(drop=True) + .reset_index() + .rename(columns={"index": aggregation_group_id}) + ) + + output_instruction = ( + "Answer user instructions using the provided context from various sources. " + "Combine all relevant information into a single, concise, well-structured response. 
" + f"Instruction: {user_instruction}.\n\n" + ) + + while len(df) > 1: + df[group_row_index] = (df[aggregation_group_id] % max_agg_rows + 1).astype( + dtypes.STRING_DTYPE + ) + df[aggregation_group_id] = (df[aggregation_group_id] / max_agg_rows).astype( + dtypes.INT_DTYPE + ) + df[llm_prompt] = "\t\nSource #" + df[group_row_index] + ": " + df[column] + + if len(df) > num_cluster: + # Aggregate within each partition + agg_df = bbq.array_agg( + df.groupby(by=[cluster_column, aggregation_group_id]) + ) + else: + # Aggregate cross partitions + agg_df = bbq.array_agg(df.groupby(by=[aggregation_group_id])) + agg_df[cluster_column] = agg_df[cluster_column].list[0] + + # Skip if the aggregated group only has a single item + single_row_df: bigframes.series.Series = bbq.array_to_string( + agg_df[agg_df[group_row_index].list.len() <= 1][column], + delimiter="", + ) + prompt_s: bigframes.series.Series = bbq.array_to_string( + agg_df[agg_df[group_row_index].list.len() > 1][llm_prompt], + delimiter="", + ) + prompt_s = output_instruction + prompt_s # type:ignore + + # Run model + predict_df = typing.cast( + bigframes.dataframe.DataFrame, model.predict(prompt_s) + ) + agg_df[column] = predict_df["ml_generate_text_llm_result"].combine_first( + single_row_df + ) + + agg_df = agg_df.reset_index() + df = agg_df[[aggregation_group_id, cluster_column, column]] + + return df[column] + + def cluster_by( + self, + column: str, + output_column: str, + model, + n_clusters: int = 5, + ): + """ + Clusters data based on the semantic similarity of text within a specified column. + + This method leverages a language model to generate text embeddings for each value in + the given column. These embeddings capture the semantic meaning of the text. + The data is then grouped into `n` clusters using the k-means clustering algorithm, + which groups data points based on the similarity of their embeddings. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True + + >>> import bigframes.ml.llm as llm + >>> model = llm.TextEmbeddingGenerator() + + >>> df = bpd.DataFrame({ + ... "Product": ["Smartphone", "Laptop", "T-shirt", "Jeans"], + ... }) + >>> df.semantics.cluster_by("Product", "Cluster ID", model, n_clusters=2) + Product Cluster ID + 0 Smartphone 2 + 1 Laptop 2 + 2 T-shirt 1 + 3 Jeans 1 + + [4 rows x 2 columns] + + Args: + column (str): + An column name to perform the similarity clustering. + + output_column (str): + An output column to store the clustering ID. + + model (bigframes.ml.llm.TextEmbeddingGenerator): + A TextEmbeddingGenerator provided by Bigframes ML package. + + n_clusters (int, default 5): + Default 5. Number of clusters to be detected. + + Returns: + bigframes.dataframe.DataFrame: A new DataFrame with the clustering output column. + + Raises: + NotImplementedError: when the semantic operator experiment is off. + ValueError: when the column refers to a non-existing column. + """ + + import bigframes.dataframe + import bigframes.ml.cluster as cluster + import bigframes.ml.llm as llm + + if not isinstance(model, llm.TextEmbeddingGenerator): + raise TypeError(f"Expect a text embedding model, but got: {type(model)}") + + if column not in self._df.columns: + raise ValueError(f"Column {column} not found.") + + if n_clusters <= 1: + raise ValueError( + f"Invalid value for `n_clusters`: {n_clusters}." + "It must be greater than 1." 
+            )
+
+        df: bigframes.dataframe.DataFrame = self._df.copy()
+        embeddings_df = model.predict(df[column])
+
+        cluster_model = cluster.KMeans(n_clusters=n_clusters)
+        cluster_model.fit(embeddings_df[["ml_generate_embedding_result"]])
+        clustered_result = cluster_model.predict(embeddings_df)
+        df[output_column] = clustered_result["CENTROID_ID"]
+        return df
+
+    def filter(self, instruction: str, model):
+        """
+        Filters the DataFrame with the semantics of the user instruction.
+
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+        >>> bpd.options.experiments.semantic_operators = True
+
+        >>> import bigframes.ml.llm as llm
+        >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
+
+        >>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]})
+        >>> df.semantics.filter("{city} is the capital of {country}", model)
+           country    city
+        1  Germany  Berlin
+
+        [1 rows x 2 columns]
+
+        Args:
+            instruction:
+                An instruction on how to filter the data. This value must contain
+                column references by name, which should be wrapped in a pair of braces.
+                For example, if you have a column "food", you can refer to this column
+                in the instructions like:
+                "The {food} is healthy."
+
+            model:
+                A GeminiTextGenerator provided by the Bigframes ML package.
+
+        Returns:
+            DataFrame filtered by the instruction.
+
+        Raises:
+            NotImplementedError: when the semantic operator experiment is off.
+            ValueError: when the instruction refers to a non-existing column, or when no
+                columns are referred to.
+        """
+        self._validate_model(model)
+        columns = self._parse_columns(instruction)
+        for column in columns:
+            if column not in self._df.columns:
+                raise ValueError(f"Column {column} not found.")
+
+        user_instruction = self._format_instruction(instruction, columns)
+        output_instruction = "Based on the provided context, reply to the following claim by only True or False:"
+
+        from bigframes.dataframe import DataFrame
+
+        results = typing.cast(
+            DataFrame,
+            model.predict(
+                self._make_prompt(columns, user_instruction, output_instruction)
+            ),
+        )
+
+        return self._df[
+            results["ml_generate_text_llm_result"].str.lower().str.contains("true")
+        ]
+
+    def map(self, instruction: str, output_column: str, model):
+        """
+        Maps the DataFrame with the semantics of the user instruction.
+
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+        >>> bpd.options.experiments.semantic_operators = True
+
+        >>> import bigframes.ml.llm as llm
+        >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
+
+        >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
+        >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model)
+          ingredient_1 ingredient_2    food
+        0   Burger Bun   Beef Patty  Burger
+
+        1     Soy Bean      Bittern    Tofu
+
+
+        [2 rows x 3 columns]
+
+        Args:
+            instruction:
+                An instruction on how to map the data. This value must contain
+                column references by name, which should be wrapped in a pair of braces.
+                For example, if you have a column "food", you can refer to this column
+                in the instructions like:
+                "Get the ingredients of {food}."
+
+            output_column:
+                The column name of the mapping result.
+
+            model:
+                A GeminiTextGenerator provided by the Bigframes ML package.
+
+        Returns:
+            DataFrame with attached mapping results.
+
+        Raises:
+            NotImplementedError: when the semantic operator experiment is off.
+            ValueError: when the instruction refers to a non-existing column, or when no
+                columns are referred to.
+        """
+        self._validate_model(model)
+        columns = self._parse_columns(instruction)
+        for column in columns:
+            if column not in self._df.columns:
+                raise ValueError(f"Column {column} not found.")
+
+        user_instruction = self._format_instruction(instruction, columns)
+        output_instruction = (
+            "Based on the provided context, answer the following instruction:"
+        )
+
+        from bigframes.series import Series
+
+        results = typing.cast(
+            Series,
+            model.predict(
+                self._make_prompt(columns, user_instruction, output_instruction)
+            )["ml_generate_text_llm_result"],
+        )
+
+        from bigframes.core.reshape import concat
+
+        return concat([self._df, results.rename(output_column)], axis=1)
+
+    def join(self, other, instruction: str, model, max_rows: int = 1000):
+        """
+        Joins two dataframes by applying the instruction over each pair of rows from
+        the left and right table.
+
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+        >>> bpd.options.experiments.semantic_operators = True
+
+        >>> import bigframes.ml.llm as llm
+        >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
+
+        >>> cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']})
+        >>> continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']})
+
+        >>> cities.semantics.join(continents, "{city} is in {continent}", model)
+                city      continent
+        0    Seattle  North America
+        1     Ottawa  North America
+        2   Shanghai           Asia
+        3  New Delhi           Asia
+
+        [4 rows x 2 columns]
+
+        Args:
+            other:
+                The other dataframe.
+
+            instruction:
+                An instruction on how left and right rows can be joined. This value must contain
+                column references by name, which should be wrapped in a pair of braces.
+                For example: "The {city} belongs to the {country}".
+                For column names that are shared between the two dataframes, you need to add "_left"
+                and "_right" suffixes for differentiation. This is especially important when you do
+                self joins. For example: "The {employee_name_left} reports to {employee_name_right}"
+                You must not add a "_left" or "_right" suffix to non-overlapping columns.
+
+            model:
+                A GeminiTextGenerator provided by the Bigframes ML package.
+
+            max_rows:
+                The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method
+                call will end early with an error.
+
+        Returns:
+            The joined dataframe.
+
+        Raises:
+            ValueError: when the amount of data that will be sent for LLM processing is larger than max_rows.
+        """
+        self._validate_model(model)
+        columns = self._parse_columns(instruction)
+
+        joined_table_rows = len(self._df) * len(other)
+
+        if joined_table_rows > max_rows:
+            raise ValueError(
+                f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}."
+            )
+
+        left_columns = []
+        right_columns = []
+
+        for col in columns:
+            if col in self._df.columns and col in other.columns:
+                raise ValueError(f"Ambiguous column reference: {col}")
+
+            elif col in self._df.columns:
+                left_columns.append(col)
+
+            elif col in other.columns:
+                right_columns.append(col)
+
+            elif col.endswith("_left"):
+                original_col_name = col[: -len("_left")]
+                if (
+                    original_col_name in self._df.columns
+                    and original_col_name in other.columns
+                ):
+                    left_columns.append(col)
+                elif original_col_name in self._df.columns:
+                    raise ValueError(f"Unnecessary suffix for {col}")
+                else:
+                    raise ValueError(f"Column {col} not found")
+
+            elif col.endswith("_right"):
+                original_col_name = col[: -len("_right")]
+                if (
+                    original_col_name in self._df.columns
+                    and original_col_name in other.columns
+                ):
+                    right_columns.append(col)
+                elif original_col_name in other.columns:
+                    raise ValueError(f"Unnecessary suffix for {col}")
+                else:
+                    raise ValueError(f"Column {col} not found")
+
+            else:
+                raise ValueError(f"Column {col} not found")
+
+        if not left_columns or not right_columns:
+            raise ValueError()
+
+        joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right"))
+
+        return joined_df.semantics.filter(instruction, model).reset_index(drop=True)
+
+    def search(
+        self,
+        search_column: str,
+        query: str,
+        top_k: int,
+        model,
+        score_column: Optional[str] = None,
+    ):
+        """
+        Performs semantic search on the DataFrame.
+
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> import bigframes
+        >>> bigframes.options.experiments.semantic_operators = True
+
+        >>> import bigframes.ml.llm as llm
+        >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")
+
+        >>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]})
+        >>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance')
+            creatures  distance
+        3  chimpanzee  0.781101
+
+        [1 rows x 2 columns]
+
+        Args:
+            search_column:
+                The name of the column to search from.
+            query (str):
+                The search query.
+            top_k (int):
+                The number of nearest neighbors to return.
+            model (TextEmbeddingGenerator):
+                A TextEmbeddingGenerator provided by the Bigframes ML package.
+            score_column (Optional[str], default None):
+                The name of the additional column containing the similarity scores. If None,
+                this column won't be attached to the result.
+
+        Returns:
+            DataFrame: the DataFrame with the search result.
+
+        Raises:
+            ValueError: when the search_column is not found in the data frame.
+            TypeError: when the provided model is not TextEmbeddingGenerator.
+        """
+
+        if search_column not in self._df.columns:
+            raise ValueError(f"Column {search_column} not found")
+
+        import bigframes.ml.llm as llm
+
+        if not isinstance(model, llm.TextEmbeddingGenerator):
+            raise TypeError(f"Expect a text embedding model, but got: {type(model)}")
+
+        embedded_df = model.predict(self._df[search_column])
+        embedded_table = embedded_df.reset_index().to_gbq()
+
+        import bigframes.pandas as bpd
+
+        embedding_result_column = "ml_generate_embedding_result"
+        query_df = model.predict(bpd.DataFrame({"query_id": [query]})).rename(
+            columns={"content": "query_id", embedding_result_column: "embedding"}
+        )
+
+        import bigframes.bigquery as bbq
+
+        search_result = (
+            bbq.vector_search(
+                base_table=embedded_table,
+                column_to_search=embedding_result_column,
+                query=query_df,
+                top_k=top_k,
+            )
+            .rename(columns={"content": search_column})
+            .set_index("index")
+        )
+
+        search_result.index.name = self._df.index.name
+
+        if score_column is not None:
+            search_result = search_result.rename(columns={"distance": score_column})[
+                [search_column, score_column]
+            ]
+        else:
+            search_result = search_result[[search_column]]
+
+        import bigframes.dataframe
+
+        return typing.cast(bigframes.dataframe.DataFrame, search_result)
+
+    def sim_join(
+        self,
+        other,
+        left_on: str,
+        right_on: str,
+        model,
+        top_k: int = 3,
+        score_column: Optional[str] = None,
+        max_rows: int = 1000,
+    ):
+        """
+        Joins two dataframes based on the similarity of the specified columns.
+
+        This method uses BigQuery's VECTOR_SEARCH function to match rows on the left side with the rows that have
+        nearest embedding vectors on the right. In the worst case scenario, the complexity is around O(M * N * log K).
+        Therefore, this is a potentially expensive operation.
+
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> import bigframes
+        >>> bigframes.options.experiments.semantic_operators = True
+
+        >>> import bigframes.ml.llm as llm
+        >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")
+
+        >>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']})
+        >>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']})
+
+        >>> df1.semantics.sim_join(df2, left_on='animal', right_on='animal', model=model, top_k=1)
+           animal  animal_1
+        0  monkey    baboon
+        1  spider  scorpion
+
+        [2 rows x 2 columns]
+
+        Args:
+            other (DataFrame):
+                The other data frame to join with.
+            left_on (str):
+                The name of the column on the left side for the join.
+            right_on (str):
+                The name of the column on the right side for the join.
+            top_k (int, default 3):
+                The number of nearest neighbors to return.
+            model (TextEmbeddingGenerator):
+                A TextEmbeddingGenerator provided by the Bigframes ML package.
+            score_column (Optional[str], default None):
+                The name of the additional column containing the similarity scores. If None,
+                this column won't be attached to the result.
+            max_rows:
+                The maximum number of rows allowed to be processed per call. If the result is too large, the method
+                call will end early with an error.
+
+        Returns:
+            DataFrame: the data frame with the join result.
+
+        Raises:
+            ValueError: when the amount of data to be processed exceeds the specified max_rows.
+ """ + + if left_on not in self._df.columns: + raise ValueError(f"Left column {left_on} not found") + if right_on not in self._df.columns: + raise ValueError(f"Right column {right_on} not found") + + import bigframes.ml.llm as llm + + if not isinstance(model, llm.TextEmbeddingGenerator): + raise TypeError(f"Expect a text embedding model, but got: {type(model)}") + + joined_table_rows = len(self._df) * len(other) + if joined_table_rows > max_rows: + raise ValueError( + f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}." + ) + + base_table_embedding_column = bigframes.core.guid.generate_guid() + base_table = self._attach_embedding( + other, right_on, base_table_embedding_column, model + ).to_gbq() + query_table = self._attach_embedding(self._df, left_on, "embedding", model) + + import bigframes.bigquery as bbq + + join_result = bbq.vector_search( + base_table=base_table, + column_to_search=base_table_embedding_column, + query=query_table, + top_k=top_k, + ) + + join_result = join_result.drop( + ["embedding", base_table_embedding_column], axis=1 + ) + + if score_column is not None: + join_result = join_result.rename(columns={"distance": score_column}) + else: + del join_result["distance"] + + return join_result + + @staticmethod + def _attach_embedding(dataframe, source_column: str, embedding_column: str, model): + result_df = dataframe.copy() + embeddings = model.predict(dataframe[source_column])[ + "ml_generate_embedding_result" + ] + result_df[embedding_column] = embeddings + return result_df + + def _make_prompt( + self, columns: List[str], user_instruction: str, output_instruction: str + ): + prompt_df = self._df[columns].copy() + prompt_df["prompt"] = f"{output_instruction}\n{user_instruction}\nContext: " + + # Combine context from multiple columns. + for col in columns: + prompt_df["prompt"] += f"{col} is `" + prompt_df[col] + "`\n" + + return prompt_df["prompt"] + + def _parse_columns(self, instruction: str) -> List[str]: + """Extracts column names enclosed in curly braces from the user instruction. + For example, _parse_columns("{city} is in {continent}") == ["city", "continent"] + """ + columns = re.findall(r"(? str: + """Extracts column names enclosed in curly braces from the user instruction. 
+ For example, `_format_instruction(["city", "continent"], "{city} is in {continent}") + == "city is in continent"` + """ + return instruction.format(**{col: col for col in columns}) + + @staticmethod + def _validate_model(model): + from bigframes.ml.llm import GeminiTextGenerator + + if not isinstance(model, GeminiTextGenerator): + raise ValueError("Model is not GeminiText Generator") diff --git a/bigframes/series.py b/bigframes/series.py index 16e2eef6f1..1a913f18d7 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1923,9 +1923,9 @@ def _slice( step: typing.Optional[int] = None, ) -> bigframes.series.Series: return bigframes.series.Series( - self._block.slice(start=start, stop=stop, step=step).select_column( - self._value_column - ), + self._block.slice( + start=start, stop=stop, step=step if (step is not None) else 1 + ).select_column(self._value_column), ) def cache(self): diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 7585dd3f45..01ff1a3f15 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -102,6 +102,7 @@ def validate_table( table_ref: bigquery.table.TableReference, columns: Optional[Sequence[str]], snapshot_time: datetime.datetime, + table_type: str, filter_str: Optional[str] = None, ) -> bool: """Validates that the table can be read, returns True iff snapshot is supported.""" @@ -124,6 +125,17 @@ def validate_table( if table_ref.dataset_id.startswith("_"): return False + # Materialized views,does not support snapshot + if table_type == "MATERIALIZED_VIEW": + warnings.warn( + "Materialized views do not support FOR SYSTEM_TIME AS OF queries. " + "Attempting query without time travel. Be aware that as materialized views " + "are updated periodically, modifications to the underlying data in the view may " + "result in errors or unexpected behavior.", + category=bigframes.exceptions.TimeTravelDisabledWarning, + ) + return False + # Second, try with snapshot to verify table supports this feature snapshot_sql = bigframes.session._io.bigquery.to_query( query_or_table=f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}", diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 7b53d40f74..04cd1a2ff0 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -31,21 +31,13 @@ import ibis import pydata_google_auth +import bigframes.constants import bigframes.version _ENV_DEFAULT_PROJECT = "GOOGLE_CLOUD_PROJECT" _APPLICATION_NAME = f"bigframes/{bigframes.version.__version__} ibis/{ibis.__version__}" _SCOPES = ["https://www.googleapis.com/auth/cloud-platform"] -# Regions for which Regional Endpoints (REPs) are supported -_REP_SUPPORTED_REGIONS = { - "me-central2", - "europe-west9", - "europe-west3", - "us-east4", - "us-west1", -} - # BigQuery is a REST API, which requires the protocol as part of the URL. 
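The clients.py hunks around this point replace the module-local `_REP_SUPPORTED_REGIONS` set with the shared `bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS`, and each client picks its API endpoint by formatting either a locational or a regional template with the session location. A minimal sketch of that selection logic, assuming a placeholder helper name and an assumed shape for the regional endpoint template (only the locational template appears verbatim in this diff):

    import bigframes.constants

    _BIGQUERY_LOCATIONAL_ENDPOINT = "https://{location}-bigquery.googleapis.com"
    # Assumed shape of the regional (REP) template; the real constant lives in clients.py.
    _BIGQUERY_REGIONAL_ENDPOINT = "https://bigquery.{location}.rep.googleapis.com"

    def pick_bigquery_api_endpoint(location):
        # Regional endpoints are used only when the location is known and REP-enabled.
        use_rep = (
            location is not None
            and location.lower() in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS
        )
        template = _BIGQUERY_REGIONAL_ENDPOINT if use_rep else _BIGQUERY_LOCATIONAL_ENDPOINT
        return template.format(location=location)

For example, a REP-enabled region such as "me-central2" resolves to the regional endpoint, while other locations keep the locational form shown in the constant below.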
_BIGQUERY_LOCATIONAL_ENDPOINT = "https://{location}-bigquery.googleapis.com" @@ -129,7 +121,8 @@ def _create_bigquery_client(self): api_endpoint=( _BIGQUERY_REGIONAL_ENDPOINT if self._location is not None - and self._location.lower() in _REP_SUPPORTED_REGIONS + and self._location.lower() + in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS else _BIGQUERY_LOCATIONAL_ENDPOINT ).format(location=self._location), ) @@ -201,7 +194,8 @@ def bqstoragereadclient(self): api_endpoint=( _BIGQUERYSTORAGE_REGIONAL_ENDPOINT if self._location is not None - and self._location.lower() in _REP_SUPPORTED_REGIONS + and self._location.lower() + in bigframes.constants.REP_ENABLED_BIGQUERY_LOCATIONS else _BIGQUERYSTORAGE_LOCATIONAL_ENDPOINT ).format(location=self._location), ) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 8508c714fd..ab2ebed0d4 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -45,6 +45,7 @@ import bigframes.core.identifiers import bigframes.core.nodes as nodes import bigframes.core.ordering as order +import bigframes.core.rewrite as rewrites import bigframes.core.schema import bigframes.core.tree_properties as tree_properties import bigframes.features @@ -186,7 +187,7 @@ def iterator_supplier(): # Runs strict validations to ensure internal type predictions and ibis are completely in sync # Do not execute these validations outside of testing suite. if "PYTEST_CURRENT_TEST" in os.environ and len(col_id_overrides) == 0: - validate_result_schema(array_value, iterator.schema) + self._validate_result_schema(array_value, iterator.schema) return ExecuteResult( arrow_batches=iterator_supplier, @@ -436,6 +437,7 @@ def _get_optimized_plan(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: if ENABLE_PRUNING: used_fields = frozenset(field.id for field in optimized_plan.fields) optimized_plan = optimized_plan.prune(used_fields) + optimized_plan = rewrites.replace_slice_ops(optimized_plan) return optimized_plan def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): @@ -558,6 +560,27 @@ def _sql_as_cached_temp_table( query_job.result() return query_job.destination + def _validate_result_schema( + self, + array_value: bigframes.core.ArrayValue, + bq_schema: list[bigquery.schema.SchemaField], + ): + actual_schema = tuple(bq_schema) + ibis_schema = bigframes.core.compile.test_only_ibis_inferred_schema( + self._get_optimized_plan(array_value.node) + ) + internal_schema = array_value.schema + if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + return + if internal_schema.to_bigquery() != actual_schema: + raise ValueError( + f"This error should only occur while testing. BigFrames internal schema: {internal_schema.to_bigquery()} does not match actual schema: {actual_schema}" + ) + if ibis_schema.to_bigquery() != actual_schema: + raise ValueError( + f"This error should only occur while testing. 
Ibis schema: {ibis_schema.to_bigquery()} does not match actual schema: {actual_schema}" + ) + def generate_head_plan(node: nodes.BigFrameNode, n: int): offsets_id = bigframes.core.guid.generate_guid("offsets_") @@ -578,21 +601,3 @@ def generate_head_plan(node: nodes.BigFrameNode, n: int): def generate_row_count_plan(node: nodes.BigFrameNode): return nodes.RowCountNode(node) - - -def validate_result_schema( - array_value: bigframes.core.ArrayValue, bq_schema: list[bigquery.schema.SchemaField] -): - actual_schema = tuple(bq_schema) - ibis_schema = array_value._compiled_schema - internal_schema = array_value.schema - if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: - return - if internal_schema.to_bigquery() != actual_schema: - raise ValueError( - f"This error should only occur while testing. BigFrames internal schema: {internal_schema.to_bigquery()} does not match actual schema: {actual_schema}" - ) - if ibis_schema.to_bigquery() != actual_schema: - raise ValueError( - f"This error should only occur while testing. Ibis schema: {ibis_schema.to_bigquery()} does not match actual schema: {actual_schema}" - ) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 22de367804..923605627d 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -339,7 +339,12 @@ def read_gbq_table( ) enable_snapshot = enable_snapshot and bf_read_gbq_table.validate_table( - self._bqclient, table_ref, all_columns, time_travel_timestamp, filter_str + self._bqclient, + table_ref, + all_columns, + time_travel_timestamp, + table.table_type, + filter_str, ) # ---------------------------- diff --git a/bigframes/version.py b/bigframes/version.py index c07f26bc6f..75f66191ca 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.21.0" +__version__ = "1.22.0" diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb new file mode 100644 index 0000000000..bfaad69ce2 --- /dev/null +++ b/notebooks/experimental/semantic_operators.ipynb @@ -0,0 +1,1749 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Enable the semantic operator experiment" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:33: UserWarning: Semantic operators are still under experiments, and are subject to change in the future.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "bigframes.options.experiments.semantic_operators = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare the LLM model. Here we are going to use Gemini 1.5 Flash." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:559: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return global_session.get_global_session()\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 1494d834-8b38-4928-9911-ba3bb9b1228b is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 6caa309b-492d-4ad3-94e3-cb2b9522ef1e is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import bigframes.ml.llm as llm\n", + "gemini_model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)\n", + "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-004\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Filtering" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job d56e32bd-f06a-4086-aac2-560ed03dceca is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 0b96351f-5a48-4059-b830-1aebd330599f is DONE. 4 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 34b2ce70-b9be-49bb-a06d-f228b0e5937c is DONE. 33 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job a4f799eb-24d6-4fcf-8661-371226788b53 is DONE. 33 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycity
1GermanyBerlin
\n", + "

1 rows × 2 columns

\n", + "
[1 rows x 2 columns in total]" + ], + "text/plain": [ + " country city\n", + "1 Germany Berlin\n", + "\n", + "[1 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({'country': ['USA', 'Germany'], 'city': ['Seattle', 'Berlin']})\n", + "df.semantics.filter(\"{city} is the capital of {country}\", gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df = bpd.DataFrame(\n", + " data={\"ingredient_1\": [\"Burger Bun\", \"Soy Bean\"], \"ingredient_2\": [\"Beef Patty\", \"Bittern\"]}\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 04a27084-a71e-4c2d-9a73-46b768615c94 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 1a4c0d7f-0bb4-4f16-b2c0-ebb930fa6cd1 is DONE. 4 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 87bf5653-d3d8-4c0a-8017-af43907465de is DONE. 34 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 12822e33-0ca3-4968-a685-7fcb2bdb0790 is DONE. 93 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ingredient_1ingredient_2food
0Burger BunBeef PattyBurger
1Soy BeanBitternTofu
\n", + "

2 rows × 3 columns

\n", + "
[2 rows x 3 columns in total]" + ], + "text/plain": [ + " ingredient_1 ingredient_2 food\n", + "0 Burger Bun Beef Patty Burger \n", + "\n", + "1 Soy Bean Bittern Tofu \n", + "\n", + "\n", + "[2 rows x 3 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.semantics.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Joining" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']})\n", + "continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job fcda7d35-d969-47a8-b611-0c516e2e39e8 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job c532592c-c4ce-4f08-9397-21b1b8b1f347 is DONE. 30 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job a11bd20f-7a75-462c-b6a5-64d954645e1b is DONE. 251 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 4703c2a9-ab08-46f1-a612-3354c5df391f is DONE. 144 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
citycontinent
0SeattleNorth America
1OttawaNorth America
2ShanghaiAsia
3New DelhiAsia
\n", + "

4 rows × 2 columns

\n", + "
[4 rows x 2 columns in total]" + ], + "text/plain": [ + " city continent\n", + "0 Seattle North America\n", + "1 Ottawa North America\n", + "2 Shanghai Asia\n", + "3 New Delhi Asia\n", + "\n", + "[4 rows x 2 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cities.semantics.join(continents, \"{city} is in {continent}\", gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Self Joins" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "animals = bpd.DataFrame({'animal': ['cow', 'cat', 'spider', 'elephant']})" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 8c1f1313-3eee-47dc-ad2d-27a49dc831dc is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 08dda435-13bd-49d0-a941-1cf91a9a1c96 is DONE. 32 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job abf33f67-0056-499b-b7fe-583391c6bc02 is DONE. 266 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 74249b99-8975-4fc4-b599-1b682edf8aeb is DONE. 180 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
animal_leftanimal_right
0cowcat
1cowspider
2catspider
3elephantcow
4elephantcat
5elephantspider
\n", + "

6 rows × 2 columns

\n", + "
[6 rows x 2 columns in total]" + ], + "text/plain": [ + " animal_left animal_right\n", + "0 cow cat\n", + "1 cow spider\n", + "2 cat spider\n", + "3 elephant cow\n", + "4 elephant cat\n", + "5 elephant spider\n", + "\n", + "[6 rows x 2 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "animals.semantics.join(animals, \"{animal_left} generally weighs heavier than {animal_right}\", gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Search" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 8be41631-537e-4b73-b3c8-1cad09dffb95 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
creatures
0salmon
1sea urchin
2baboons
3frog
4chimpanzee
\n", + "

5 rows × 1 columns

\n", + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " creatures\n", + "0 salmon\n", + "1 sea urchin\n", + "2 baboons\n", + "3 frog\n", + "4 chimpanzee\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({\"creatures\": [\"salmon\", \"sea urchin\", \"baboons\", \"frog\", \"chimpanzee\"]})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 56d5f17f-f64a-46ca-8d30-74f8e2ad5dec is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job fe75b64a-41a3-4675-ae1e-d2db6b2270d3 is DONE. 10 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 9f06c24e-d931-4e59-a444-1a6013c43290 is DONE. 30.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 627b8206-b3f9-4c25-a5d9-dde7c0042a4d is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job d01597bb-30ef-495f-be5d-c9fb16d4c112 is DONE. 2 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job bbc67bc3-830d-4ede-829d-16d4829dec33 is RUNNING. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 0c844655-b7d9-494b-8073-925b4e0743ce is DONE. 37.2 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 1993f0be-bfc2-4dad-ba85-92f5bba44945 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
creaturessimilarity score
2baboons0.773411
4chimpanzee0.781101
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " creatures similarity score\n", + "2 baboons 0.773411\n", + "4 chimpanzee 0.781101\n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.semantics.search(\"creatures\", \"monkey\", top_k = 2, model = text_embedding_model, score_column='similarity score')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Similarity Join" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df1 = bpd.DataFrame({'animal': ['monkey', 'spider', 'salmon', 'giraffe', 'sparrow']})\n", + "df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon', 'owl', 'elephant', 'tuna']})" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 222a9dcb-2389-4ad3-a1e6-c2b197f3a409 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 24afcd9d-6be5-44d9-aa89-6fbe71f5e9a7 is DONE. 10 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 6bc36226-7bbb-4954-b042-044e9fd98a47 is DONE. 30.8 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job f247f63d-1d8a-4f81-a833-628143fda463 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 9bec5633-8ba1-4453-b9c7-6cb555d3c60e is DONE. 10 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d7df7004-b499-436b-898c-15abee330d9e is RUNNING. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 9012c011-b4e7-4fba-85a6-e439fe3c32d3 is DONE. 61.5 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job bb9987eb-aa37-42ca-bcf1-1ea575a147a8 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
animalanimal_1distance
0monkeybaboon0.747665
1spiderscorpion0.890909
2salmontuna0.925461
3giraffeelephant0.887858
4sparrowowl0.932959
\n", + "

5 rows × 3 columns

\n", + "
[5 rows x 3 columns in total]" + ], + "text/plain": [ + " animal animal_1 distance\n", + "0 monkey baboon 0.747665\n", + "1 spider scorpion 0.890909\n", + "2 salmon tuna 0.925461\n", + "3 giraffe elephant 0.887858\n", + "4 sparrow owl 0.932959\n", + "\n", + "[5 rows x 3 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.semantics.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model= text_embedding_model, score_column='distance')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 46e1cbb4-2b4a-4578-b3fd-7caba80d5dcc is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 356840f4-840c-41fc-9c9e-8bbaf9ffa02c is DONE. 4 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 428d070e-fd5c-4b2f-b651-b3de9836c02a is DONE. 12.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job bf566989-7bd4-4560-952e-34d007ee1e7e is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 92818df7-d4e9-4cea-884e-304126e78b71 is DONE. 4 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job e8619330-7b91-4ae2-99b3-f4386de4c512 is RUNNING. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job be89eca7-462a-4b1c-95ed-0b0c031aaaac is DONE. 24.6 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 40dcd8ed-1262-459a-b6b3-7471722da078 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
animalanimal_1
0monkeybaboon
1spiderscorpion
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " animal animal_1\n", + "0 monkey baboon\n", + "1 spider scorpion\n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1 = bpd.DataFrame({'animal': ['monkey', 'spider']})\n", + "df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']})\n", + "\n", + "df1.semantics.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model= text_embedding_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Aggregation" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job ea1e5180-a13a-4ec7-a6b4-8eca042ac9a6 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MoviesYear
0Titanic1997
1The Wolf of Wall Street2013
2Killers of the Flower Moon2023
3The Revenant2015
4Inception2010
5Shuttle Island2010
6The Great Gatsby2013
\n", + "

7 rows × 2 columns

\n", + "
[7 rows x 2 columns in total]" + ], + "text/plain": [ + " Movies Year\n", + "0 Titanic 1997\n", + "1 The Wolf of Wall Street 2013\n", + "2 Killers of the Flower Moon 2023\n", + "3 The Revenant 2015\n", + "4 Inception 2010\n", + "5 Shuttle Island 2010\n", + "6 The Great Gatsby 2013\n", + "\n", + "[7 rows x 2 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({\n", + " \"Movies\": [\n", + " \"Titanic\",\n", + " \"The Wolf of Wall Street\",\n", + " \"Killers of the Flower Moon\",\n", + " \"The Revenant\",\n", + " \"Inception\",\n", + " \"Shuttle Island\",\n", + " \"The Great Gatsby\",\n", + " ],\n", + " \"Year\": [1997, 2013, 2023, 2015, 2010, 2010, 2013],\n", + "})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 274df4fb-06ee-49d8-8e7f-2c7eaee3440f is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 878b41c8-6428-4f05-aa0b-dcba14761ac0 is DONE. 2 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5a909cb7-fcbf-43d5-aac2-79b7ba466dd3 is DONE. 16 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 10f97d30-101c-447c-876c-d329d3a6d89b is DONE. 28 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job b1b94183-6ad4-4014-94da-7d585d45bc6d is DONE. 28 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 Leonardo \n", + "\n", + "Name: Movies, dtype: string" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agg_df = df.semantics.agg(\"Find the shared first name of actors in {Movies}. One word answer.\", model=gemini_model)\n", + "agg_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job e52f886a-1f87-45fc-990d-e66c23417a66 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 82ac6302-78a1-41f7-8665-769887a47d42 is DONE. 10 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job cd42b04e-e9ea-4b56-a891-78608dbef215 is DONE. 30.8 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job dced08f2-12ee-4b52-b5b2-b7dd177dae12 is DONE. 30.7 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5cbdac9b-f5dd-488c-8262-7a96f8501faa is DONE. 138.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job e30ff06e-b561-4ea2-b150-8cd91d4f827c is DONE. 80 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 278d8a51-711a-42fe-86aa-408b2b44d4c7 is DONE. 170 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ProductCluster ID
0Smartphone3
1Laptop3
2Coffee Maker1
3T-shirt2
4Jeans2
\n", + "

5 rows × 2 columns

\n", + "
[5 rows x 2 columns in total]" + ], + "text/plain": [ + " Product Cluster ID\n", + "0 Smartphone 3\n", + "1 Laptop 3\n", + "2 Coffee Maker 1\n", + "3 T-shirt 2\n", + "4 Jeans 2\n", + "\n", + "[5 rows x 2 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({'Product': ['Smartphone', 'Laptop', 'Coffee Maker', 'T-shirt', 'Jeans']})\n", + "\n", + "df.semantics.cluster_by(column='Product', output_column='Cluster ID', model=text_embedding_model, n_clusters=3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb index 824d911aff..4bfdcc24aa 100644 --- a/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb +++ b/notebooks/ml/bq_dataframes_ml_cross_validation.ipynb @@ -272,7 +272,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2.1 Define KFold class and Train/Test for Each Fold (Mauanl Approach)" + "## 2.1 Define KFold class and Train/Test for Each Fold (Manual Approach)" ] }, { diff --git a/noxfile.py b/noxfile.py index 714c8333bd..92f8acad7f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -23,7 +23,6 @@ import re import shutil import time -import traceback from typing import Dict, List import warnings @@ -794,10 +793,6 @@ def notebook(session: nox.Session): *notebooks, ) - # Shared flag using multiprocessing.Manager() to indicate if - # any process encounters an error. This flag may be updated - # across different processes. - error_flag = multiprocessing.Manager().Value("i", False) processes = [] for notebook in notebooks: args = ( @@ -808,8 +803,8 @@ def notebook(session: nox.Session): ) if multi_process_mode: process = multiprocessing.Process( - target=_run_process, - args=(session, args, error_flag), + target=session.run, + args=args, ) process.start() processes.append(process) @@ -819,10 +814,6 @@ def notebook(session: nox.Session): else: session.run(*args) - for process in processes: - process.join() - - processes = [] for notebook, regions in notebooks_reg.items(): for region in regions: region_args = ( @@ -834,8 +825,8 @@ def notebook(session: nox.Session): ) if multi_process_mode: process = multiprocessing.Process( - target=_run_process, - args=(session, region_args, error_flag), + target=session.run, + args=region_args, ) process.start() processes.append(process) @@ -847,11 +838,6 @@ def notebook(session: nox.Session): for process in processes: process.join() - - # Check the shared error flag and raise an exception if any process - # reported an error - if error_flag.value: - raise Exception("Errors occurred in one or more subprocesses.") finally: # Prevent our notebook changes from getting checked in to git # accidentally. 
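With the shared multiprocessing error flag removed from the notebook session above, failure and timing reporting now lives in the run_and_publish_benchmark.py changes that follow: the script scrapes each notebook's wall time from pytest's duration output and records failures as `.error` marker files. A small sketch of the duration scraping, using the same regex as the script; the sample line is invented for illustration:

    import re

    # Matches the seconds figure in pytest duration lines such as "12.34s call ...".
    duration_pattern = re.compile(r"(\d+\.\d+)s call")

    sample_line = "12.34s call     notebooks/getting_started.ipynb::test_notebook"  # illustrative only
    match = duration_pattern.search(sample_line)
    if match:
        # The script writes this figure to a <file_path>.local_exec_time_seconds file.
        print(match.group(1))  # prints "12.34"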
@@ -868,15 +854,6 @@ def notebook(session: nox.Session): ) -def _run_process(session: nox.Session, args, error_flag): - try: - session.run(*args) - except Exception: - traceback_str = traceback.format_exc() - print(traceback_str) - error_flag.value = True - - @nox.session(python=DEFAULT_PYTHON_VERSION) def benchmark(session: nox.Session): session.install("-e", ".[all]") diff --git a/scripts/run_and_publish_benchmark.py b/scripts/run_and_publish_benchmark.py index a42301cb13..8b55493770 100644 --- a/scripts/run_and_publish_benchmark.py +++ b/scripts/run_and_publish_benchmark.py @@ -17,10 +17,11 @@ import json import os import pathlib +import re import subprocess import sys import tempfile -from typing import Dict, List, Union +from typing import Dict, List, Tuple, Union import numpy as np import pandas as pd @@ -30,7 +31,7 @@ CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute() -def run_benchmark_subprocess(args, log_env_name_var, filename=None, region=None): +def run_benchmark_subprocess(args, log_env_name_var, file_path=None, region=None): """ Runs a benchmark subprocess with configured environment variables. Adjusts PYTHONPATH, sets region-specific BigQuery location, and logs environment variables. @@ -48,10 +49,37 @@ def run_benchmark_subprocess(args, log_env_name_var, filename=None, region=None) if region: env["BIGQUERY_LOCATION"] = region env[LOGGING_NAME_ENV_VAR] = log_env_name_var - subprocess.run(args, env=env, check=True) - - -def collect_benchmark_result(benchmark_path: str, iterations: int) -> pd.DataFrame: + try: + if file_path: # Notebooks + duration_pattern = re.compile(r"(\d+\.\d+)s call") + process = subprocess.Popen(args, env=env, stdout=subprocess.PIPE, text=True) + assert process.stdout is not None + for line in process.stdout: + print(line, end="") + match = duration_pattern.search(line) + if match: + duration = match.group(1) + with open(f"{file_path}.local_exec_time_seconds", "w") as f: + f.write(f"{duration}\n") + process.wait() + if process.returncode != 0: + raise subprocess.CalledProcessError(process.returncode, args) + else: # Benchmarks + file_path = log_env_name_var + subprocess.run(args, env=env, check=True) + except Exception: + directory = pathlib.Path(file_path).parent + for file in directory.glob(f"{pathlib.Path(file_path).name}.*"): + if file.suffix != ".backup": + print(f"Benchmark failed, deleting: {file}") + file.unlink() + error_file = directory / f"{pathlib.Path(file_path).name}.error" + error_file.touch() + + +def collect_benchmark_result( + benchmark_path: str, iterations: int +) -> Tuple[pd.DataFrame, Union[str, None]]: """Generate a DataFrame report on HTTP queries, bytes processed, slot time and execution time from log files.""" path = pathlib.Path(benchmark_path) try: @@ -59,25 +87,18 @@ def collect_benchmark_result(benchmark_path: str, iterations: int) -> pd.DataFra bytes_files = sorted(path.rglob("*.bytesprocessed")) millis_files = sorted(path.rglob("*.slotmillis")) bq_seconds_files = sorted(path.rglob("*.bq_exec_time_seconds")) - local_seconds_files = sorted(path.rglob("*.local_exec_time_seconds")) - has_local_seconds = len(local_seconds_files) > 0 - - if has_local_seconds: - if not ( - len(bytes_files) - == len(millis_files) - == len(local_seconds_files) - == len(bq_seconds_files) - ): - raise ValueError( - "Mismatch in the number of report files for bytes, millis, and seconds." 
- ) - else: - if not (len(bytes_files) == len(millis_files) == len(bq_seconds_files)): - raise ValueError( - "Mismatch in the number of report files for bytes, millis, and seconds." - ) + error_files = sorted(path.rglob("*.error")) + + if not ( + len(bytes_files) + == len(millis_files) + == len(local_seconds_files) + == len(bq_seconds_files) + ): + raise ValueError( + "Mismatch in the number of report files for bytes, millis, and seconds." + ) for idx in range(len(bytes_files)): bytes_file = bytes_files[idx] @@ -92,12 +113,11 @@ def collect_benchmark_result(benchmark_path: str, iterations: int) -> pd.DataFra "File name mismatch among bytes, millis, and seconds reports." ) - if has_local_seconds: - local_seconds_file = local_seconds_files[idx] - if filename != local_seconds_file.relative_to(path).with_suffix(""): - raise ValueError( - "File name mismatch among bytes, millis, and seconds reports." - ) + local_seconds_file = local_seconds_files[idx] + if filename != local_seconds_file.relative_to(path).with_suffix(""): + raise ValueError( + "File name mismatch among bytes, millis, and seconds reports." + ) with open(bytes_file, "r") as file: lines = file.read().splitlines() @@ -108,12 +128,9 @@ def collect_benchmark_result(benchmark_path: str, iterations: int) -> pd.DataFra lines = file.read().splitlines() total_slot_millis = sum(int(line) for line in lines) / iterations - if has_local_seconds: - with open(local_seconds_file, "r") as file: - lines = file.read().splitlines() - local_seconds = sum(float(line) for line in lines) / iterations - else: - local_seconds = None + with open(local_seconds_file, "r") as file: + lines = file.read().splitlines() + local_seconds = sum(float(line) for line in lines) / iterations with open(bq_seconds_file, "r") as file: lines = file.read().splitlines() @@ -132,6 +149,7 @@ def collect_benchmark_result(benchmark_path: str, iterations: int) -> pd.DataFra path.rglob("*.slotmillis"), path.rglob("*.local_exec_time_seconds"), path.rglob("*.bq_exec_time_seconds"), + path.rglob("*.error"), ): for log_file in files_to_remove: log_file.unlink() @@ -170,13 +188,19 @@ def collect_benchmark_result(benchmark_path: str, iterations: int) -> pd.DataFra f" bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds" ) - geometric_mean_queries = geometric_mean(benchmark_metrics["Query_Count"]) - geometric_mean_bytes = geometric_mean(benchmark_metrics["Bytes_Processed"]) - geometric_mean_slot_millis = geometric_mean(benchmark_metrics["Slot_Millis"]) - geometric_mean_local_seconds = geometric_mean( + geometric_mean_queries = geometric_mean_excluding_zeros( + benchmark_metrics["Query_Count"] + ) + geometric_mean_bytes = geometric_mean_excluding_zeros( + benchmark_metrics["Bytes_Processed"] + ) + geometric_mean_slot_millis = geometric_mean_excluding_zeros( + benchmark_metrics["Slot_Millis"] + ) + geometric_mean_local_seconds = geometric_mean_excluding_zeros( benchmark_metrics["Local_Execution_Time_Sec"] ) - geometric_mean_bq_seconds = geometric_mean( + geometric_mean_bq_seconds = geometric_mean_excluding_zeros( benchmark_metrics["BigQuery_Execution_Time_Sec"] ) @@ -188,15 +212,33 @@ def collect_benchmark_result(benchmark_path: str, iterations: int) -> pd.DataFra f"Geometric mean of BigQuery execution time: {geometric_mean_bq_seconds} seconds---" ) - return benchmark_metrics.reset_index().rename(columns={"index": "Benchmark_Name"}) + error_message = ( + "\n" + + "\n".join( + [ + f"Failed: {error_file.relative_to(path).with_suffix('')}" + for error_file in 
error_files + ] + ) + if error_files + else None + ) + return ( + benchmark_metrics.reset_index().rename(columns={"index": "Benchmark_Name"}), + error_message, + ) -def geometric_mean(data): +def geometric_mean_excluding_zeros(data): """ - Calculate the geometric mean of a dataset, rounding the result to one decimal place. - Returns NaN if the dataset is empty or contains only NaN values. + Calculate the geometric mean of a dataset, excluding any zero values. + Returns NaN if the dataset is empty, contains only NaN values, or if + all non-NaN values are zeros. + + The result is rounded to one decimal place. """ data = data.dropna() + data = data[data != 0] if len(data) == 0: return np.nan log_data = np.log(data) @@ -321,13 +363,15 @@ def run_notebook_benchmark(benchmark_file: str, region: str): "py.test", "--nbmake", "--nbmake-timeout=900", # 15 minutes + "--durations=0", + "--color=yes", ] benchmark_args = (*pytest_command, benchmark_file) run_benchmark_subprocess( args=benchmark_args, log_env_name_var=log_env_name_var, - filename=export_file, + file_path=export_file, region=region, ) @@ -383,7 +427,7 @@ def main(): args = parse_arguments() if args.publish_benchmarks: - benchmark_metrics = collect_benchmark_result( + benchmark_metrics, error_message = collect_benchmark_result( args.publish_benchmarks, args.iterations ) # Output results to CSV without specifying a location @@ -412,6 +456,9 @@ def main(): # intended for local testing where the default behavior is not to publish results. elif project := os.getenv("GCLOUD_BENCH_PUBLISH_PROJECT", ""): publish_to_bigquery(benchmark_metrics, args.notebook, project) + + if error_message: + raise Exception(error_message) elif args.notebook: run_notebook_benchmark(args.benchmark_path, args.region) else: diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index a30c36065b..e5b7585514 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -11,6 +11,28 @@ This section lists the benchmarks currently available, with descriptions and lin - **TPC-H Benchmark**: Based on the TPC-H standards, this benchmark evaluates transaction processing capabilities. It is adapted from code found in the Polars repository, specifically tailored to test and compare these capabilities. Details are available on the [Polars Benchmark GitHub repository](https://github.com/pola-rs/polars-benchmark). - **Notebooks**: These Jupyter notebooks showcase BigFrames' key features and patterns, and also enable performance benchmarking. Explore them at the [BigFrames Notebooks repository](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks). +## Benchmark Configuration Using `config.jsonl` Files + +For each benchmark, a corresponding `config.jsonl` file exists in the same folder or its parent folder. These configuration files allow users to control various benchmark parameters without modifying the code directly. By updating the relevant `config.jsonl` file in the specific benchmark's folder, you can easily configure settings such as: +- **benchmark_suffix**: A suffix appended to the benchmark name for identification purposes. +- **ordered**: Controls the mode for BigFrames, specifying whether to use ordered (`true`) or unordered mode (`false`). +- **project_id**: The Google Cloud project ID where the benchmark dataset or table is located. +- **dataset_id**: The dataset ID for querying during the benchmark. 
+- **table_id**: This is **required** for benchmarks like `dbbenchmark` that target a specific table, but is **not configurable** for benchmarks like `TPC-H`, which use multiple tables with fixed names. + +### Example `config.jsonl` Files + +#### `dbbenchmark` Example +```jsonl +{"benchmark_suffix": "50g_ordered", "project_id": "your-google-cloud-project", "dataset_id": "dbbenchmark", "table_id": "G1_1e9_1e2_5_0", "ordered": true} +{"benchmark_suffix": "50g_unordered", "project_id": "your-google-cloud-project", "dataset_id": "dbbenchmark", "table_id": "G1_1e9_1e2_5_0", "ordered": false} +``` + +#### `TPC-H` Example +```jsonl +{"benchmark_suffix": "10t_unordered", "project_id": "your-google-cloud-project", "dataset_id": "tpch_0010t", "ordered": false} +``` + ## Usage Examples Our benchmarking process runs internally on a daily basis to continuously monitor the performance of BigFrames. However, there are occasions when you might need to conduct benchmarking locally to test specific changes or new features. diff --git a/tests/benchmark/db_benchmark/groupby/config.jsonl b/tests/benchmark/db_benchmark/groupby/config.jsonl index dd881e76ac..b6f23ebbf7 100644 --- a/tests/benchmark/db_benchmark/groupby/config.jsonl +++ b/tests/benchmark/db_benchmark/groupby/config.jsonl @@ -1,2 +1,2 @@ -{"benchmark_suffix": "50g_ordered", "table_id": "G1_1e9_1e2_5_0", "ordered": true} -{"benchmark_suffix": "50g_unordered", "table_id": "G1_1e9_1e2_5_0", "ordered": false} +{"benchmark_suffix": "50g_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "dbbenchmark", "table_id": "G1_1e9_1e2_5_0", "ordered": true} +{"benchmark_suffix": "50g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "dbbenchmark", "table_id": "G1_1e9_1e2_5_0", "ordered": false} diff --git a/tests/benchmark/db_benchmark/groupby/q1.py b/tests/benchmark/db_benchmark/groupby/q1.py index 02a709def9..dc86817908 100644 --- a/tests/benchmark/db_benchmark/groupby/q1.py +++ b/tests/benchmark/db_benchmark/groupby/q1.py @@ -18,9 +18,21 @@ import bigframes_vendored.db_benchmark.groupby_queries as vendored_dbbenchmark_groupby_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_groupby_queries.q1, current_path, suffix, table_id, session + vendored_dbbenchmark_groupby_queries.q1, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/groupby/q10.py b/tests/benchmark/db_benchmark/groupby/q10.py index 0cd195b04a..99d28e2f9a 100644 --- a/tests/benchmark/db_benchmark/groupby/q10.py +++ b/tests/benchmark/db_benchmark/groupby/q10.py @@ -18,13 +18,21 @@ import bigframes_vendored.db_benchmark.groupby_queries as vendored_dbbenchmark_groupby_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( vendored_dbbenchmark_groupby_queries.q10, current_path, suffix, + project_id, + dataset_id, table_id, session, ) diff --git a/tests/benchmark/db_benchmark/groupby/q2.py b/tests/benchmark/db_benchmark/groupby/q2.py index 398c63e09f..b06a4189fe 100644 --- a/tests/benchmark/db_benchmark/groupby/q2.py 
+++ b/tests/benchmark/db_benchmark/groupby/q2.py @@ -18,9 +18,21 @@ import bigframes_vendored.db_benchmark.groupby_queries as vendored_dbbenchmark_groupby_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_groupby_queries.q2, current_path, suffix, table_id, session + vendored_dbbenchmark_groupby_queries.q2, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/groupby/q3.py b/tests/benchmark/db_benchmark/groupby/q3.py index 9863b969d8..d66dd7b39d 100644 --- a/tests/benchmark/db_benchmark/groupby/q3.py +++ b/tests/benchmark/db_benchmark/groupby/q3.py @@ -18,9 +18,21 @@ import bigframes_vendored.db_benchmark.groupby_queries as vendored_dbbenchmark_groupby_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_groupby_queries.q3, current_path, suffix, table_id, session + vendored_dbbenchmark_groupby_queries.q3, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/groupby/q4.py b/tests/benchmark/db_benchmark/groupby/q4.py index ce29e3ceaf..6c72069a53 100644 --- a/tests/benchmark/db_benchmark/groupby/q4.py +++ b/tests/benchmark/db_benchmark/groupby/q4.py @@ -18,9 +18,21 @@ import bigframes_vendored.db_benchmark.groupby_queries as vendored_dbbenchmark_groupby_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_groupby_queries.q4, current_path, suffix, table_id, session + vendored_dbbenchmark_groupby_queries.q4, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/groupby/q5.py b/tests/benchmark/db_benchmark/groupby/q5.py index 27a4a52a8f..3e6db9783e 100644 --- a/tests/benchmark/db_benchmark/groupby/q5.py +++ b/tests/benchmark/db_benchmark/groupby/q5.py @@ -18,9 +18,21 @@ import bigframes_vendored.db_benchmark.groupby_queries as vendored_dbbenchmark_groupby_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_groupby_queries.q5, current_path, suffix, table_id, session + vendored_dbbenchmark_groupby_queries.q5, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/groupby/q6.py b/tests/benchmark/db_benchmark/groupby/q6.py index f1befc6840..f763280b5b 100644 --- a/tests/benchmark/db_benchmark/groupby/q6.py +++ b/tests/benchmark/db_benchmark/groupby/q6.py @@ -18,9 +18,21 @@ import bigframes_vendored.db_benchmark.groupby_queries as vendored_dbbenchmark_groupby_queries if __name__ == 
"__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_groupby_queries.q6, current_path, suffix, table_id, session + vendored_dbbenchmark_groupby_queries.q6, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/groupby/q7.py b/tests/benchmark/db_benchmark/groupby/q7.py index c0791612e8..4e7f2d58b6 100644 --- a/tests/benchmark/db_benchmark/groupby/q7.py +++ b/tests/benchmark/db_benchmark/groupby/q7.py @@ -18,9 +18,21 @@ import bigframes_vendored.db_benchmark.groupby_queries as vendored_dbbenchmark_groupby_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_groupby_queries.q7, current_path, suffix, table_id, session + vendored_dbbenchmark_groupby_queries.q7, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/groupby/q8.py b/tests/benchmark/db_benchmark/groupby/q8.py index 46dd8c45eb..75d5dcaa0c 100644 --- a/tests/benchmark/db_benchmark/groupby/q8.py +++ b/tests/benchmark/db_benchmark/groupby/q8.py @@ -18,9 +18,21 @@ import bigframes_vendored.db_benchmark.groupby_queries as vendored_dbbenchmark_groupby_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_groupby_queries.q8, current_path, suffix, table_id, session + vendored_dbbenchmark_groupby_queries.q8, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/join/config.jsonl b/tests/benchmark/db_benchmark/join/config.jsonl index 72884d6c5a..e709281137 100644 --- a/tests/benchmark/db_benchmark/join/config.jsonl +++ b/tests/benchmark/db_benchmark/join/config.jsonl @@ -1,2 +1,2 @@ -{"benchmark_suffix": "50g_ordered", "table_id": "J1_1e9_NA_0_0", "ordered": true} -{"benchmark_suffix": "50g_unordered", "table_id": "J1_1e9_NA_0_0", "ordered": false} +{"benchmark_suffix": "50g_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "dbbenchmark", "table_id": "J1_1e9_NA_0_0", "ordered": true} +{"benchmark_suffix": "50g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "dbbenchmark", "table_id": "J1_1e9_NA_0_0", "ordered": false} diff --git a/tests/benchmark/db_benchmark/join/q1.py b/tests/benchmark/db_benchmark/join/q1.py index ce05359789..4ca0ee3389 100644 --- a/tests/benchmark/db_benchmark/join/q1.py +++ b/tests/benchmark/db_benchmark/join/q1.py @@ -18,10 +18,22 @@ import bigframes_vendored.db_benchmark.join_queries as vendored_dbbenchmark_join_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - 
vendored_dbbenchmark_join_queries.q1, current_path, suffix, table_id, session + vendored_dbbenchmark_join_queries.q1, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/join/q2.py b/tests/benchmark/db_benchmark/join/q2.py index 6c9141b316..19efd6fbf2 100644 --- a/tests/benchmark/db_benchmark/join/q2.py +++ b/tests/benchmark/db_benchmark/join/q2.py @@ -18,10 +18,22 @@ import bigframes_vendored.db_benchmark.join_queries as vendored_dbbenchmark_join_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_join_queries.q2, current_path, suffix, table_id, session + vendored_dbbenchmark_join_queries.q2, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/join/q3.py b/tests/benchmark/db_benchmark/join/q3.py index 284ab6a2b3..d0a931bfb2 100644 --- a/tests/benchmark/db_benchmark/join/q3.py +++ b/tests/benchmark/db_benchmark/join/q3.py @@ -18,10 +18,22 @@ import bigframes_vendored.db_benchmark.join_queries as vendored_dbbenchmark_join_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_join_queries.q3, current_path, suffix, table_id, session + vendored_dbbenchmark_join_queries.q3, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/join/q4.py b/tests/benchmark/db_benchmark/join/q4.py index 1504e0a663..ebd7c461d0 100644 --- a/tests/benchmark/db_benchmark/join/q4.py +++ b/tests/benchmark/db_benchmark/join/q4.py @@ -18,10 +18,22 @@ import bigframes_vendored.db_benchmark.join_queries as vendored_dbbenchmark_join_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_join_queries.q4, current_path, suffix, table_id, session + vendored_dbbenchmark_join_queries.q4, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/join/q5.py b/tests/benchmark/db_benchmark/join/q5.py index 575b3711e6..7114acd408 100644 --- a/tests/benchmark/db_benchmark/join/q5.py +++ b/tests/benchmark/db_benchmark/join/q5.py @@ -18,10 +18,22 @@ import bigframes_vendored.db_benchmark.join_queries as vendored_dbbenchmark_join_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_join_queries.q5, current_path, suffix, table_id, session + vendored_dbbenchmark_join_queries.q5, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/db_benchmark/sort/config.jsonl 
b/tests/benchmark/db_benchmark/sort/config.jsonl index 72884d6c5a..e709281137 100644 --- a/tests/benchmark/db_benchmark/sort/config.jsonl +++ b/tests/benchmark/db_benchmark/sort/config.jsonl @@ -1,2 +1,2 @@ -{"benchmark_suffix": "50g_ordered", "table_id": "J1_1e9_NA_0_0", "ordered": true} -{"benchmark_suffix": "50g_unordered", "table_id": "J1_1e9_NA_0_0", "ordered": false} +{"benchmark_suffix": "50g_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "dbbenchmark", "table_id": "J1_1e9_NA_0_0", "ordered": true} +{"benchmark_suffix": "50g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "dbbenchmark", "table_id": "J1_1e9_NA_0_0", "ordered": false} diff --git a/tests/benchmark/db_benchmark/sort/q1.py b/tests/benchmark/db_benchmark/sort/q1.py index f17a843192..5f6c404443 100644 --- a/tests/benchmark/db_benchmark/sort/q1.py +++ b/tests/benchmark/db_benchmark/sort/q1.py @@ -18,9 +18,21 @@ import bigframes_vendored.db_benchmark.sort_queries as vendored_dbbenchmark_sort_queries if __name__ == "__main__": - table_id, session, suffix = utils.get_dbbenchmark_configuration() + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( - vendored_dbbenchmark_sort_queries.q1, current_path, suffix, table_id, session + vendored_dbbenchmark_sort_queries.q1, + current_path, + suffix, + project_id, + dataset_id, + table_id, + session, ) diff --git a/tests/benchmark/tpch/q1.py b/tests/benchmark/tpch/q1.py index 3f1c63967e..a672103931 100644 --- a/tests/benchmark/tpch/q1.py +++ b/tests/benchmark/tpch/q1.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q1 as vendored_tpch_q1 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q10.py b/tests/benchmark/tpch/q10.py index bea18975ca..d468a90156 100644 --- a/tests/benchmark/tpch/q10.py +++ b/tests/benchmark/tpch/q10.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q10 as vendored_tpch_q10 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q11.py b/tests/benchmark/tpch/q11.py index 538d8d3e5f..dbf3fd94de 100644 --- a/tests/benchmark/tpch/q11.py +++ b/tests/benchmark/tpch/q11.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q11 as vendored_tpch_q11 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q12.py b/tests/benchmark/tpch/q12.py index 6503b543f4..57774457ae 100644 --- a/tests/benchmark/tpch/q12.py +++ b/tests/benchmark/tpch/q12.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q12 as vendored_tpch_q12 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q13.py 
b/tests/benchmark/tpch/q13.py index 60c2101f6f..a7f2780e4b 100644 --- a/tests/benchmark/tpch/q13.py +++ b/tests/benchmark/tpch/q13.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q13 as vendored_tpch_q13 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q14.py b/tests/benchmark/tpch/q14.py index 1698a01628..e9599f3bd8 100644 --- a/tests/benchmark/tpch/q14.py +++ b/tests/benchmark/tpch/q14.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q14 as vendored_tpch_q14 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q15.py b/tests/benchmark/tpch/q15.py index 49e2ce4e92..ff200384a8 100644 --- a/tests/benchmark/tpch/q15.py +++ b/tests/benchmark/tpch/q15.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q15 as vendored_tpch_q15 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q16.py b/tests/benchmark/tpch/q16.py index ef6edf6b12..69fc1b9523 100644 --- a/tests/benchmark/tpch/q16.py +++ b/tests/benchmark/tpch/q16.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q16 as vendored_tpch_q16 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q17.py b/tests/benchmark/tpch/q17.py index 2f680d206e..14707f4a93 100644 --- a/tests/benchmark/tpch/q17.py +++ b/tests/benchmark/tpch/q17.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q17 as vendored_tpch_q17 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q18.py b/tests/benchmark/tpch/q18.py index 7336246f1b..54cf0d0432 100644 --- a/tests/benchmark/tpch/q18.py +++ b/tests/benchmark/tpch/q18.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q18 as vendored_tpch_q18 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q19.py b/tests/benchmark/tpch/q19.py index 3bf34794bf..1ec44391ff 100644 --- a/tests/benchmark/tpch/q19.py +++ b/tests/benchmark/tpch/q19.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q19 as vendored_tpch_q19 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q2.py 
b/tests/benchmark/tpch/q2.py index c738aae124..da8064b400 100644 --- a/tests/benchmark/tpch/q2.py +++ b/tests/benchmark/tpch/q2.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q2 as vendored_tpch_q2 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q20.py b/tests/benchmark/tpch/q20.py index 1b254c6a78..33e4f72ef6 100644 --- a/tests/benchmark/tpch/q20.py +++ b/tests/benchmark/tpch/q20.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q20 as vendored_tpch_q20 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q21.py b/tests/benchmark/tpch/q21.py index 18e8df87fe..f73f87725f 100644 --- a/tests/benchmark/tpch/q21.py +++ b/tests/benchmark/tpch/q21.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q21 as vendored_tpch_q21 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q22.py b/tests/benchmark/tpch/q22.py index 6c10021c2b..0a6f6d923c 100644 --- a/tests/benchmark/tpch/q22.py +++ b/tests/benchmark/tpch/q22.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q22 as vendored_tpch_q22 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q3.py b/tests/benchmark/tpch/q3.py index 5bcaaa0d5d..92322eea21 100644 --- a/tests/benchmark/tpch/q3.py +++ b/tests/benchmark/tpch/q3.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q3 as vendored_tpch_q3 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q4.py b/tests/benchmark/tpch/q4.py index 462c6336d1..2d6931d6b1 100644 --- a/tests/benchmark/tpch/q4.py +++ b/tests/benchmark/tpch/q4.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q4 as vendored_tpch_q4 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q5.py b/tests/benchmark/tpch/q5.py index 108cde58cc..e8fd83e193 100644 --- a/tests/benchmark/tpch/q5.py +++ b/tests/benchmark/tpch/q5.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q5 as vendored_tpch_q5 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q6.py b/tests/benchmark/tpch/q6.py index 
ccefc1b0bf..152d6c663e 100644 --- a/tests/benchmark/tpch/q6.py +++ b/tests/benchmark/tpch/q6.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q6 as vendored_tpch_q6 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q7.py b/tests/benchmark/tpch/q7.py index 0cad599a60..1c3e455e1c 100644 --- a/tests/benchmark/tpch/q7.py +++ b/tests/benchmark/tpch/q7.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q7 as vendored_tpch_q7 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q8.py b/tests/benchmark/tpch/q8.py index 6c6ac23b9b..8d23194834 100644 --- a/tests/benchmark/tpch/q8.py +++ b/tests/benchmark/tpch/q8.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q8 as vendored_tpch_q8 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/tpch/q9.py b/tests/benchmark/tpch/q9.py index 05c82fb66e..329e315c2c 100644 --- a/tests/benchmark/tpch/q9.py +++ b/tests/benchmark/tpch/q9.py @@ -17,7 +17,7 @@ import bigframes_vendored.tpch.queries.q9 as vendored_tpch_q9 if __name__ == "__main__": - project_id, dataset_id, session, suffix = utils.get_tpch_configuration() + project_id, dataset_id, session, suffix = utils.get_configuration() current_path = pathlib.Path(__file__).absolute() utils.get_execution_time( diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py index 32be33fc74..887d54dba2 100644 --- a/tests/benchmark/utils.py +++ b/tests/benchmark/utils.py @@ -18,43 +18,29 @@ import bigframes -def get_dbbenchmark_configuration(): +def get_configuration(include_table_id=False): parser = argparse.ArgumentParser() - parser.add_argument( - "--table_id", - type=str, - required=True, - help="The BigQuery table ID to query.", - ) - parser.add_argument( - "--ordered", - type=str, - help="Set to True (default) to have an ordered session, or False for an unordered session.", - ) - parser.add_argument( - "--benchmark_suffix", - type=str, - help="Suffix to append to benchmark names for identification purposes.", - ) - args = parser.parse_args() - session = _initialize_session(_str_to_bool(args.ordered)) - return args.table_id, session, args.benchmark_suffix - - -def get_tpch_configuration(): - parser = argparse.ArgumentParser(description="Process TPC-H Query using BigFrames.") parser.add_argument( "--project_id", type=str, required=True, - help="The BigQuery dataset ID to query.", + help="The BigQuery project ID.", ) parser.add_argument( "--dataset_id", type=str, required=True, - help="The BigQuery dataset ID to query.", + help="The BigQuery dataset ID.", ) + + if include_table_id: + parser.add_argument( + "--table_id", + type=str, + required=True, + help="The BigQuery table ID to query.", + ) + parser.add_argument( "--ordered", type=str, @@ -68,7 +54,22 @@ def get_tpch_configuration(): args = parser.parse_args() session = _initialize_session(_str_to_bool(args.ordered)) - return args.project_id, args.dataset_id, 
session, args.benchmark_suffix + + if include_table_id: + return ( + args.project_id, + args.dataset_id, + args.table_id, + session, + args.benchmark_suffix, + ) + else: + return ( + args.project_id, + args.dataset_id, + session, + args.benchmark_suffix, + ) def get_execution_time(func, current_path, suffix, *args, **kwargs): diff --git a/tests/system/large/operations/__init__.py b/tests/system/large/operations/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/tests/system/large/operations/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/large/operations/conftest.py b/tests/system/large/operations/conftest.py new file mode 100644 index 0000000000..7ab3811f10 --- /dev/null +++ b/tests/system/large/operations/conftest.py @@ -0,0 +1,33 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.ml.llm as llm + + +@pytest.fixture(scope="session") +def gemini_flash_model(session, bq_connection) -> llm.GeminiTextGenerator: + return llm.GeminiTextGenerator( + session=session, + connection_name=bq_connection, + model_name="gemini-1.5-flash-001", + ) + + +@pytest.fixture(scope="session") +def text_embedding_generator(session, bq_connection) -> llm.TextEmbeddingGenerator: + return llm.TextEmbeddingGenerator( + session=session, connection_name=bq_connection, model_name="text-embedding-004" + ) diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py new file mode 100644 index 0000000000..2d7f4756af --- /dev/null +++ b/tests/system/large/operations/test_semantics.py @@ -0,0 +1,635 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas as pd +import pandas.testing +import pytest + +import bigframes +import bigframes.dataframe as dataframe +import bigframes.dtypes as dtypes + + +def test_semantics_experiment_off_raise_error(): + bigframes.options.experiments.semantic_operators = False + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} + ) + + with pytest.raises(NotImplementedError): + df.semantics + + +@pytest.mark.parametrize( + ("max_agg_rows", "cluster_column"), + [ + pytest.param(1, None, id="one", marks=pytest.mark.xfail(raises=ValueError)), + pytest.param(2, None, id="two"), + pytest.param(3, None, id="three"), + pytest.param(4, None, id="four"), + pytest.param(5, "Year", id="two_w_cluster_column"), + pytest.param(6, "Year", id="three_w_cluster_column"), + pytest.param(7, "Year", id="four_w_cluster_column"), + ], +) +def test_agg_w_max_agg_rows(session, gemini_flash_model, max_agg_rows, cluster_column): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={ + "Movies": [ + "Titanic", + "The Wolf of Wall Street", + "Killers of the Flower Moon", + "The Revenant", + "Inception", + "Shuttle Island", + "The Great Gatsby", + ], + "Year": [1997, 2013, 2023, 2015, 2010, 2010, 2013], + }, + session=session, + ) + instruction = "Find the shared first name of actors in {Movies}. One word answer." + actual_s = df.semantics.agg( + instruction, + model=gemini_flash_model, + max_agg_rows=max_agg_rows, + cluster_column=cluster_column, + ).to_pandas() + + expected_s = pd.Series(["Leonardo \n"], dtype=dtypes.STRING_DTYPE) + expected_s.name = "Movies" + pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{city} is in the {non_existing_column}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{city} is in the {country}", + id="two_columns", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + ], +) +def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} + ) + df.semantics.agg(instruction, gemini_flash_model) + + +@pytest.mark.parametrize( + "cluster_column", + [ + pytest.param( + "non_existing_column", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "Movies", id="non_int_column", marks=pytest.mark.xfail(raises=TypeError) + ), + ], +) +def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_column): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={ + "Movies": [ + "Titanic", + "The Wolf of Wall Street", + "Killers of the Flower Moon", + "The Revenant", + ], + }, + ) + instruction = "Find the shared first name of actors in {Movies}. One word answer." 
+ df.semantics.agg(instruction, gemini_flash_model, cluster_column=cluster_column) + + +@pytest.mark.parametrize( + ("n_clusters"), + [ + pytest.param(1, id="one", marks=pytest.mark.xfail(raises=ValueError)), + pytest.param(2, id="two"), + pytest.param(4, id="four"), + ], +) +def test_cluster_by(session, text_embedding_generator, n_clusters): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), + session=session, + ) + output_column = "cluster id" + result = df.semantics.cluster_by( + "Product", + output_column, + text_embedding_generator, + n_clusters=n_clusters, + ) + + assert output_column in result + assert len(result[output_column].unique()) == n_clusters + + +def test_cluster_by_invalid_column(session, text_embedding_generator): + bigframes.options.experiments.semantic_operators = True + + df = dataframe.DataFrame( + ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), + session=session, + ) + + output_column = "cluster id" + with pytest.raises(ValueError): + df.semantics.cluster_by( + "unknown_column", + output_column, + text_embedding_generator, + n_clusters=3, + ) + + +def test_cluster_by_invalid_model(session, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + + df = dataframe.DataFrame( + ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), + session=session, + ) + + output_column = "cluster id" + with pytest.raises(TypeError): + df.semantics.cluster_by( + "Product", + output_column, + gemini_flash_model, + n_clusters=3, + ) + + +def test_filter(session, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, + session=session, + ) + + actual_df = df.semantics.filter( + "{city} is the capital of {country}", gemini_flash_model + ).to_pandas() + + expected_df = pd.DataFrame({"country": ["Germany"], "city": ["Berlin"]}, index=[1]) + pandas.testing.assert_frame_equal( + actual_df, expected_df, check_dtype=False, check_index_type=False + ) + + +def test_filter_single_column_reference(session, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, + session=session, + ) + + actual_df = df.semantics.filter( + "{country} is in Europe", gemini_flash_model + ).to_pandas() + + expected_df = pd.DataFrame({"country": ["Germany"], "city": ["Berlin"]}, index=[1]) + pandas.testing.assert_frame_equal( + actual_df, expected_df, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + "instruction", + [ + "No column reference", + "{city} is in the {non_existing_column}", + ], +) +def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} + ) + + with pytest.raises(ValueError): + df.semantics.filter(instruction, gemini_flash_model) + + +def test_filter_invalid_model_raise_error(): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} + ) + + with pytest.raises(ValueError): + df.semantics.filter("{city} is the capital of {country}", None) + + +def test_map(session, gemini_flash_model): 
+ bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + }, + session=session, + ) + + actual_df = df.semantics.map( + "What is the food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + gemini_flash_model, + ).to_pandas() + # Result sanitation + actual_df["food"] = actual_df["food"].str.strip().str.lower() + + expected_df = pd.DataFrame( + { + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "food": ["burger", "tofu"], + } + ) + pandas.testing.assert_frame_equal( + actual_df, + expected_df, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +@pytest.mark.parametrize( + "instruction", + [ + "No column reference", + "What is the food made from {ingredient_1} and {non_existing_column}?}", + ], +) +def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + } + ) + + with pytest.raises(ValueError): + df.semantics.map(instruction, "food", gemini_flash_model) + + +def test_map_invalid_model_raise_error(): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + }, + ) + + with pytest.raises(ValueError): + df.semantics.map( + "What is the food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + None, + ) + + +def test_join(session, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + + actual_df = cities.semantics.join( + countries, + "{city} belongs to {country}", + gemini_flash_model, + ).to_pandas() + + expected_df = pd.DataFrame( + { + "city": ["Seattle", "Berlin"], + "country": ["USA", "Germany"], + } + ) + pandas.testing.assert_frame_equal( + actual_df, + expected_df, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +def test_self_join(session, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + animals = dataframe.DataFrame( + data={ + "animal": ["spider", "capybara"], + }, + session=session, + ) + + actual_df = animals.semantics.join( + animals, + "{animal_left} is heavier than {animal_right}", + gemini_flash_model, + ).to_pandas() + + expected_df = pd.DataFrame( + { + "animal_left": ["capybara"], + "animal_right": ["spider"], + } + ) + pandas.testing.assert_frame_equal( + actual_df, + expected_df, + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +def test_join_data_too_large_raise_error(session, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + + with pytest.raises(ValueError): + cities.semantics.join( + countries, "{city} belongs to {country}", gemini_flash_model, max_rows=1 + ) + + +@pytest.mark.parametrize( + ("instruction", "error_pattern"), + [ + 
("No column reference", "No column references"), + pytest.param( + "{city} is in {continent}", r"Column .+ not found", id="non_existing_column" + ), + pytest.param( + "{city} is in {country}", + r"Ambiguous column reference: .+", + id="ambiguous_column", + ), + pytest.param( + "{city_left} is in {country}", + r"Unnecessary suffix for .+", + id="suffix_on_left_unique_column", + ), + pytest.param( + "{city} is in {region_right}", + r"Unnecessary suffix for .+", + id="suffix_on_right_unique_column", + ), + pytest.param( + "{city_right} is in {country}", r"Column .+ not found", id="wrong_suffix" + ), + pytest.param( + "{city} is in {continent_right}", + r"Column .+ not found", + id="suffix_on_non_existing_column", + ), + ], +) +def test_join_invalid_instruction_raise_error( + instruction, error_pattern, gemini_flash_model +): + bigframes.options.experiments.semantic_operators = True + df1 = dataframe.DataFrame( + {"city": ["Seattle", "Berlin"], "country": ["USA", "Germany"]} + ) + df2 = dataframe.DataFrame( + { + "country": ["USA", "UK", "Germany"], + "region": ["North America", "Europe", "Europe"], + } + ) + + with pytest.raises(ValueError, match=error_pattern): + df1.semantics.join(df2, instruction, gemini_flash_model) + + +def test_join_invalid_model_raise_error(): + bigframes.options.experiments.semantic_operators = True + cities = dataframe.DataFrame({"city": ["Seattle", "Berlin"]}) + countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) + + with pytest.raises(ValueError): + cities.semantics.join(countries, "{city} is in {country}", None) + + +@pytest.mark.parametrize( + "score_column", + [ + pytest.param(None, id="no_score_column"), + pytest.param("distance", id="has_score_column"), + ], +) +def test_search(session, text_embedding_generator, score_column): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + actual_result = df.semantics.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + score_column=score_column, + ).to_pandas() + + expected_result = pd.Series( + ["baboons", "chimpanzee"], index=[2, 4], name="creatures" + ) + pandas.testing.assert_series_equal( + actual_result["creatures"], + expected_result, + check_dtype=False, + check_index_type=False, + ) + + if score_column is None: + assert len(actual_result.columns) == 1 + else: + assert score_column in actual_result.columns + + +def test_search_invalid_column_raises_error(session, text_embedding_generator): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with pytest.raises(ValueError): + df.semantics.search( + "whatever", "monkey", top_k=2, model=text_embedding_generator + ) + + +def test_search_invalid_model_raises_error(session): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with pytest.raises(TypeError): + df.semantics.search("creatures", "monkey", top_k=2, model=None) + + +@pytest.mark.parametrize( + "score_column", + [ + pytest.param(None, id="no_score_column"), + pytest.param("distance", id="has_score_column"), + ], +) +def test_sim_join(session, text_embedding_generator, score_column): + bigframes.options.experiments.semantic_operators = True + df1 
= dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + actual_result = df1.semantics.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + score_column=score_column, + ).to_pandas() + + expected_result = pd.DataFrame( + {"creatures": ["salmon", "cat"], "creatures_1": ["tuna", "dog"]} + ) + pandas.testing.assert_frame_equal( + actual_result[["creatures", "creatures_1"]], + expected_result, + check_dtype=False, + check_index_type=False, + ) + + if score_column is None: + assert len(actual_result.columns) == 2 + else: + assert score_column in actual_result.columns + + +@pytest.mark.parametrize( + ("left_on", "right_on"), + [ + pytest.param("whatever", "creatures", id="incorrect_left_column"), + pytest.param("creatures", "whatever", id="incorrect_right_column"), + ], +) +def test_sim_join_invalid_column_raises_error( + session, text_embedding_generator, left_on, right_on +): + bigframes.options.experiments.semantic_operators = True + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with pytest.raises(ValueError): + df1.semantics.sim_join( + df2, left_on=left_on, right_on=right_on, model=text_embedding_generator + ) + + +def test_sim_join_invalid_model_raises_error(session): + bigframes.options.experiments.semantic_operators = True + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with pytest.raises(TypeError): + df1.semantics.sim_join( + df2, left_on="creatures", right_on="creatures", model=None + ) + + +def test_sim_join_data_too_large_raises_error(session, text_embedding_generator): + bigframes.options.experiments.semantic_operators = True + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with pytest.raises(ValueError): + df1.semantics.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + max_rows=1, + ) diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py index 204c6b7463..3521e4cd20 100644 --- a/tests/system/large/test_location.py +++ b/tests/system/large/test_location.py @@ -22,7 +22,9 @@ import bigframes.session.clients -def _assert_bq_execution_location(session: bigframes.Session): +def _assert_bq_execution_location( + session: bigframes.Session, expected_location: typing.Optional[str] = None +): df = session.read_gbq( """ SELECT "aaa" as name, 111 as number @@ -33,10 +35,10 @@ def _assert_bq_execution_location(session: bigframes.Session): """ ) - assert ( - typing.cast(bigquery.QueryJob, df.query_job).location - == session.bqclient.location - ) + if expected_location is None: + expected_location = session._location + + assert typing.cast(bigquery.QueryJob, df.query_job).location == expected_location result = ( df[["name", "number"]] @@ -47,8 +49,7 @@ def _assert_bq_execution_location(session: bigframes.Session): ) assert ( - typing.cast(bigquery.QueryJob, result.query_job).location - == session.bqclient.location + typing.cast(bigquery.QueryJob, result.query_job).location == expected_location ) @@ -87,6 +88,30 @@ def 
test_bq_location(bigquery_location): _assert_bq_execution_location(session) +@pytest.mark.parametrize( + ("set_location", "resolved_location"), + # Sort the set to avoid nondeterminism. + [ + (loc.capitalize(), loc) + for loc in sorted(bigframes.constants.ALL_BIGQUERY_LOCATIONS) + ], +) +def test_bq_location_non_canonical(set_location, resolved_location): + session = bigframes.Session( + context=bigframes.BigQueryOptions(location=set_location) + ) + + assert session.bqclient.location == resolved_location + + # by default global endpoint is used + assert ( + session.bqclient._connection.API_BASE_URL == "https://bigquery.googleapis.com" + ) + + # assert that bigframes session honors the location + _assert_bq_execution_location(session, resolved_location) + + @pytest.mark.parametrize( "bigquery_location", # Sort the set to avoid nondeterminism. diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 18d2609347..2365002857 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1670,7 +1670,11 @@ def analyze(row): (3, 4): ["pq", "rs", "tu"], (5.0, "six", 7): [8, 9, 10], 'raise Exception("hacked!")': [11, 12, 13], - } + }, + # Default pandas index has non-numpy type, whereas bigframes is + # always numpy-based type, so let's use the index compatible + # with bigframes. See more details in b/369689696. + index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()), ), id="all-kinds-of-column-names", ), @@ -1681,17 +1685,22 @@ def analyze(row): "y": [1.5, 3.75, 5], "z": ["pq", "rs", "tu"], }, - index=pandas.MultiIndex.from_tuples( - [ - ("a", 100), - ("a", 200), - ("b", 300), - ] + index=pandas.MultiIndex.from_frame( + pandas.DataFrame( + { + "idx0": pandas.Series( + ["a", "a", "b"], dtype=pandas.StringDtype() + ), + "idx1": pandas.Series( + [100, 200, 300], dtype=pandas.Int64Dtype() + ), + } + ) ), ), id="multiindex", marks=pytest.mark.skip( - reason="TODO(b/368639580) revert this skip after fix" + reason="TODO: revert this skip after this pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/59908" ), ), pytest.param( @@ -1701,6 +1710,10 @@ def analyze(row): [20, 3.75, "rs"], [30, 8.0, "tu"], ], + # Default pandas index has non-numpy type, whereas bigframes is + # always numpy-based type, so let's use the index compatible + # with bigframes. See more details in b/369689696. + index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()), columns=pandas.MultiIndex.from_arrays( [ ["first", "last_two", "last_two"], @@ -1729,10 +1742,8 @@ def test_df_apply_axis_1_complex(session, pd_df): def serialize_row(row): custom = { - "name": row.name.item() if hasattr(row.name, "item") else row.name, - "index": [ - idx.item() if hasattr(idx, "item") else idx for idx in row.index - ], + "name": row.name, + "index": [idx for idx in row.index], "values": [ val.item() if hasattr(val, "item") else val for val in row.values ], @@ -1756,12 +1767,7 @@ def serialize_row(row): bf_result = bf_df.apply(serialize_row_remote, axis=1).to_pandas() pd_result = pd_df.apply(serialize_row, axis=1) - # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object' - # , ignore this mismatch by using check_dtype=False. - # - # bf_result.index[0].dtype is 'string[pyarrow]' while - # pd_result.index[0].dtype is 'object', ignore this mismatch by using - # check_index_type=False. 
+ # ignore known dtype difference between pandas and bigframes pandas.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 51b45485ad..4b0f50973b 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -38,30 +38,6 @@ def llm_remote_text_df(session, llm_remote_text_pandas_df): return session.read_pandas(llm_remote_text_pandas_df) -@pytest.mark.flaky(retries=2) -def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_df): - model = llm.PaLM2TextGenerator(model_name="text-bison", max_iterations=1) - - X_train = llm_fine_tune_df_default_index[["prompt"]] - y_train = llm_fine_tune_df_default_index[["label"]] - model.fit(X_train, y_train) - - assert model is not None - - df = model.predict(llm_remote_text_df["prompt"]).to_pandas() - utils.check_pandas_df_schema_and_index( - df, - columns=[ - "ml_generate_text_llm_result", - "ml_generate_text_rai_result", - "ml_generate_text_status", - "prompt", - ], - index=3, - ) - # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept - - @pytest.mark.flaky(retries=2) def test_llm_gemini_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_df): model = llm.GeminiTextGenerator(model_name="gemini-pro", max_iterations=1) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 8c2912edd4..1fb12d3f82 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -591,15 +591,19 @@ def test_join_repr(scalars_dfs_maybe_ordered): assert actual == expected -def test_repr_html_w_all_rows(scalars_dfs): +def test_repr_html_w_all_rows(scalars_dfs, session): + metrics = session._metrics scalars_df, _ = scalars_dfs # get a pandas df of the expected format df, _ = scalars_df._block.to_pandas() pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) pandas_df.index.name = scalars_df.index.name + executions_pre = metrics.execution_count # When there are 10 or fewer rows, the outputs should be identical except for the extra note. 
actual = scalars_df.head(10)._repr_html_() + executions_post = metrics.execution_count + with display_options.pandas_repr(bigframes.options.display): pandas_repr = pandas_df.head(10)._repr_html_() @@ -608,6 +612,7 @@ def test_repr_html_w_all_rows(scalars_dfs): + f"[{len(pandas_df.index)} rows x {len(pandas_df.columns)} columns in total]" ) assert actual == expected + assert (executions_post - executions_pre) <= 2 def test_df_column_name_with_space(scalars_dfs): @@ -1516,6 +1521,30 @@ def test_shape(scalars_dfs): assert bf_result == pd_result +@pytest.mark.parametrize( + "reference_table, test_table", + [ + ( + "bigframes-dev.bigframes_tests_sys.base_table", + "bigframes-dev.bigframes_tests_sys.base_table_mat_view", + ), + ( + "bigframes-dev.bigframes_tests_sys.base_table", + "bigframes-dev.bigframes_tests_sys.base_table_view", + ), + ( + "bigframes-dev.bigframes_tests_sys.csv_native_table", + "bigframes-dev.bigframes_tests_sys.csv_external_table", + ), + ], +) +def test_view_and_external_table_shape(session, reference_table, test_table): + reference_df = session.read_gbq(reference_table) + test_df = session.read_gbq(test_table) + + assert test_df.shape == reference_df.shape + + def test_len(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = len(scalars_df) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 8574860daa..2d5ae21bb4 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -421,6 +421,34 @@ def test_dataframe_groupby_getitem( pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) +def test_dataframe_groupby_getitem_error( + scalars_df_index, + scalars_pandas_df_index, +): + col_names = ["float64_col", "int64_col", "bool_col", "string_col"] + with pytest.raises(KeyError, match="\"Columns not found: 'not_in_group'\""): + ( + scalars_df_index[col_names] + .groupby("string_col")["not_in_group"] + .min() + .to_pandas() + ) + + +def test_dataframe_groupby_getitem_multiple_columns_error( + scalars_df_index, + scalars_pandas_df_index, +): + col_names = ["float64_col", "int64_col", "bool_col", "string_col"] + with pytest.raises(KeyError, match="\"Columns not found: 'col1', 'col2'\""): + ( + scalars_df_index[col_names] + .groupby("string_col")["col1", "col2"] + .min() + .to_pandas() + ) + + def test_dataframe_groupby_getitem_list( scalars_df_index, scalars_pandas_df_index, diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index ab2a9c19b8..cab74f617d 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1178,7 +1178,7 @@ def test_column_multi_index_dot_not_supported(): bf1 @ bf2 -def test_explode_w_multi_index(): +def test_explode_w_column_multi_index(): data = [[[1, 1], np.nan, [3, 3]], [[2], [5], []]] multi_level_columns = pandas.MultiIndex.from_arrays( [["col0", "col0", "col1"], ["col00", "col01", "col11"]] @@ -1197,6 +1197,24 @@ def test_explode_w_multi_index(): ) +def test_explode_w_multi_index(): + data = [[[1, 1], np.nan, [3, 3]], [[2], [5], []]] + columns = ["col00", "col01", "col11"] + multi_index = pandas.MultiIndex.from_frame( + pandas.DataFrame({"idx0": [5, 1], "idx1": ["z", "x"]}) + ) + + df = bpd.DataFrame(data, index=multi_index, columns=columns) + pd_df = df.to_pandas() + + pandas.testing.assert_frame_equal( + df.explode("col00").to_pandas(), + pd_df.explode("col00"), + check_dtype=False, + check_index_type=False, + ) + + def test_column_multi_index_w_na_stack(scalars_df_index, 
scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] level1 = pandas.Index(["b", "c", "d"]) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 624e287f8d..f1c60664a1 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3852,6 +3852,28 @@ def test_series_explode(data): pytest.param([5, 1, 3, 2], False, id="ignore_unordered_index"), pytest.param(["z", "x", "a", "b"], True, id="str_index"), pytest.param(["z", "x", "a", "b"], False, id="ignore_str_index"), + pytest.param( + pd.Index(["z", "x", "a", "b"], name="idx"), True, id="str_named_index" + ), + pytest.param( + pd.Index(["z", "x", "a", "b"], name="idx"), + False, + id="ignore_str_named_index", + ), + pytest.param( + pd.MultiIndex.from_frame( + pd.DataFrame({"idx0": [5, 1, 3, 2], "idx1": ["z", "x", "a", "b"]}) + ), + True, + id="multi_index", + ), + pytest.param( + pd.MultiIndex.from_frame( + pd.DataFrame({"idx0": [5, 1, 3, 2], "idx1": ["z", "x", "a", "b"]}) + ), + False, + id="ignore_multi_index", + ), ], ) def test_series_explode_w_index(index, ignore_index): diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 17e8b99704..4b48915d2d 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -390,9 +390,16 @@ def test_read_gbq_twice_with_same_timestamp(session, penguins_table_id): assert df3 is not None -def test_read_gbq_on_linked_dataset_warns(session): +@pytest.mark.parametrize( + "source_table", + [ + "bigframes-dev.thelook_ecommerce.orders", + "bigframes-dev.bigframes_tests_sys.base_table_mat_view", + ], +) +def test_read_gbq_on_linked_dataset_warns(session, source_table): with warnings.catch_warnings(record=True) as warned: - session.read_gbq("bigframes-dev.thelook_ecommerce.orders") + session.read_gbq(source_table) assert len(warned) == 1 assert warned[0].category == bigframes.exceptions.TimeTravelDisabledWarning diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index b827b0723d..d04b5bd575 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -90,20 +90,33 @@ def test_setter_if_session_started_but_setting_the_same_value(attribute): [ (None,), ("us-central1",), + ("us-Central1",), + ("US-CENTRAL1",), + ("US",), + ("us",), ], ) def test_location_set_to_valid_no_warning(valid_location): - options = bigquery_options.BigQueryOptions() - # Ensure that no warnings are emitted. - # https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html#additional-use-cases-of-warnings-in-tests - with warnings.catch_warnings(): - # Turn matching UnknownLocationWarning into exceptions. - # https://docs.python.org/3/library/warnings.html#warning-filter - warnings.simplefilter( - "error", category=bigframes.exceptions.UnknownLocationWarning - ) + # test setting location through constructor + def set_location_in_ctor(): + bigquery_options.BigQueryOptions(location=valid_location) + + # test setting location property + def set_location_property(): + options = bigquery_options.BigQueryOptions() options.location = valid_location + for op in [set_location_in_ctor, set_location_property]: + # Ensure that no warnings are emitted. + # https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html#additional-use-cases-of-warnings-in-tests + with warnings.catch_warnings(): + # Turn matching UnknownLocationWarning into exceptions. 
+ # https://docs.python.org/3/library/warnings.html#warning-filter + warnings.simplefilter( + "error", category=bigframes.exceptions.UnknownLocationWarning + ) + op() + @pytest.mark.parametrize( [ @@ -122,11 +135,20 @@ def test_location_set_to_valid_no_warning(valid_location): ], ) def test_location_set_to_invalid_warning(invalid_location, possibility): - options = bigquery_options.BigQueryOptions() - with pytest.warns( - bigframes.exceptions.UnknownLocationWarning, - match=re.escape( - f"The location '{invalid_location}' is set to an unknown value. Did you mean '{possibility}'?" - ), - ): + # test setting location through constructor + def set_location_in_ctor(): + bigquery_options.BigQueryOptions(location=invalid_location) + + # test setting location property + def set_location_property(): + options = bigquery_options.BigQueryOptions() options.location = invalid_location + + for op in [set_location_in_ctor, set_location_property]: + with pytest.warns( + bigframes.exceptions.UnknownLocationWarning, + match=re.escape( + f"The location '{invalid_location}' is set to an unknown value. Did you mean '{possibility}'?" + ), + ): + op() diff --git a/tests/unit/_config/test_experiment_options.py b/tests/unit/_config/test_experiment_options.py new file mode 100644 index 0000000000..49c3d9e53c --- /dev/null +++ b/tests/unit/_config/test_experiment_options.py @@ -0,0 +1,32 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import bigframes._config.experiment_options as experiment_options + + +def test_semantic_operators_default_false(): + options = experiment_options.ExperimentOptions() + + assert options.semantic_operators is False + + +def test_semantic_operators_set_true_shows_warning(): + options = experiment_options.ExperimentOptions() + + with pytest.warns(UserWarning): + options.semantic_operators = True + + assert options.semantic_operators is True diff --git a/third_party/bigframes_vendored/db_benchmark/groupby_queries.py b/third_party/bigframes_vendored/db_benchmark/groupby_queries.py index 672a9b5d5f..7758496db5 100644 --- a/third_party/bigframes_vendored/db_benchmark/groupby_queries.py +++ b/third_party/bigframes_vendored/db_benchmark/groupby_queries.py @@ -4,10 +4,10 @@ import bigframes.session -def q1(table_id: str, session: bigframes.Session): +def q1(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Groupby benchmark 1: sum v1 by id1") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = x.groupby("id1", as_index=False, dropna=False).agg({"v1": "sum"}) print(ans.shape) @@ -15,10 +15,10 @@ def q1(table_id: str, session: bigframes.Session): print(chk) -def q2(table_id: str, session: bigframes.Session): +def q2(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Groupby benchmark 2: sum v1 by id1:id2") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = x.groupby(["id1", "id2"], as_index=False, dropna=False).agg({"v1": "sum"}) print(ans.shape) @@ -26,10 +26,10 @@ def q2(table_id: str, session: bigframes.Session): print(chk) -def q3(table_id: str, session: bigframes.Session): +def q3(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Groupby benchmark 3: sum v1 mean v3 by id3") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = x.groupby("id3", as_index=False, dropna=False).agg( {"v1": "sum", "v3": "mean"} @@ -39,10 +39,10 @@ def q3(table_id: str, session: bigframes.Session): print(chk) -def q4(table_id: str, session: bigframes.Session): +def q4(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Groupby benchmark 4: mean v1:v3 by id4") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = x.groupby("id4", as_index=False, dropna=False).agg( {"v1": "mean", "v2": "mean", "v3": "mean"} @@ -52,10 +52,10 @@ def q4(table_id: str, session: bigframes.Session): print(chk) -def q5(table_id: str, session: bigframes.Session): +def q5(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Groupby benchmark 5: sum v1:v3 by id6") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = x.groupby("id6", as_index=False, dropna=False).agg( {"v1": "sum", "v2": "sum", "v3": "sum"} @@ -65,10 +65,10 @@ def q5(table_id: str, session: bigframes.Session): print(chk) -def q6(table_id: str, session: bigframes.Session): +def q6(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Groupby benchmark 6: median v3 sd v3 by id4 id5") - x = 
session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = x.groupby(["id4", "id5"], as_index=False, dropna=False).agg( {"v3": ["median", "std"]} @@ -78,10 +78,10 @@ def q6(table_id: str, session: bigframes.Session): print(chk) -def q7(table_id: str, session: bigframes.Session): +def q7(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Groupby benchmark 7: max v1 - min v2 by id3") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = ( x.groupby("id3", as_index=False, dropna=False) @@ -93,10 +93,10 @@ def q7(table_id: str, session: bigframes.Session): print(chk) -def q8(table_id: str, session: bigframes.Session): +def q8(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Groupby benchmark 8: largest two v3 by id6") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = ( x[~x["v3"].isna()][["id6", "v3"]] @@ -110,10 +110,10 @@ def q8(table_id: str, session: bigframes.Session): print(chk) -def q10(table_id: str, session: bigframes.Session): +def q10(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Groupby benchmark 10: sum v3 count by id1:id6") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = x.groupby( ["id1", "id2", "id3", "id4", "id5", "id6"], as_index=False, dropna=False diff --git a/third_party/bigframes_vendored/db_benchmark/join_queries.py b/third_party/bigframes_vendored/db_benchmark/join_queries.py index 0c01e427a6..f0073436c0 100644 --- a/third_party/bigframes_vendored/db_benchmark/join_queries.py +++ b/third_party/bigframes_vendored/db_benchmark/join_queries.py @@ -4,12 +4,12 @@ import bigframes -def q1(table_id: str, session: bigframes.Session): +def q1(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Join benchmark 1: small inner on int") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") small = session.read_gbq( - f"bigframes-dev-perf.dbbenchmark.{_get_join_table_id(table_id, 'small')}" + f"{project_id}.{dataset_id}.{_get_join_table_id(table_id, 'small')}" ) ans = x.merge(small, on="id1") @@ -19,12 +19,12 @@ def q1(table_id: str, session: bigframes.Session): print(chk) -def q2(table_id: str, session: bigframes.Session): +def q2(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Join benchmark 2: medium inner on int") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") medium = session.read_gbq( - f"bigframes-dev-perf.dbbenchmark.{_get_join_table_id(table_id, 'medium')}" + f"{project_id}.{dataset_id}.{_get_join_table_id(table_id, 'medium')}" ) ans = x.merge(medium, on="id2") @@ -34,12 +34,12 @@ def q2(table_id: str, session: bigframes.Session): print(chk) -def q3(table_id: str, session: bigframes.Session): +def q3(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Join benchmark 3: medium outer on int") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") medium = session.read_gbq( - 
f"bigframes-dev-perf.dbbenchmark.{_get_join_table_id(table_id, 'medium')}" + f"{project_id}.{dataset_id}.{_get_join_table_id(table_id, 'medium')}" ) ans = x.merge(medium, how="left", on="id2") @@ -49,12 +49,12 @@ def q3(table_id: str, session: bigframes.Session): print(chk) -def q4(table_id: str, session: bigframes.Session): +def q4(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Join benchmark 4: medium inner on factor") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") medium = session.read_gbq( - f"bigframes-dev-perf.dbbenchmark.{_get_join_table_id(table_id, 'medium')}" + f"{project_id}.{dataset_id}.{_get_join_table_id(table_id, 'medium')}" ) ans = x.merge(medium, on="id5") @@ -64,12 +64,12 @@ def q4(table_id: str, session: bigframes.Session): print(chk) -def q5(table_id: str, session: bigframes.Session): +def q5(project_id: str, dataset_id: str, table_id: str, session: bigframes.Session): print("Join benchmark 5: big inner on int") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") big = session.read_gbq( - f"bigframes-dev-perf.dbbenchmark.{_get_join_table_id(table_id, 'big')}" + f"{project_id}.{dataset_id}.{_get_join_table_id(table_id, 'big')}" ) ans = x.merge(big, on="id3") diff --git a/third_party/bigframes_vendored/db_benchmark/sort_queries.py b/third_party/bigframes_vendored/db_benchmark/sort_queries.py index 600df103cf..bbaf46cf27 100644 --- a/third_party/bigframes_vendored/db_benchmark/sort_queries.py +++ b/third_party/bigframes_vendored/db_benchmark/sort_queries.py @@ -4,10 +4,12 @@ import bigframes.session -def q1(table_id: str, session: bigframes.Session) -> None: +def q1( + project_id: str, dataset_id: str, table_id: str, session: bigframes.Session +) -> None: print("Sort benchmark 1: sort by int id2") - x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + x = session.read_gbq(f"{project_id}.{dataset_id}.{table_id}") ans = x.sort_values("id2") print(ans.shape) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index c07f26bc6f..75f66191ca 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.21.0" +__version__ = "1.22.0"