From 9aae0cbcda8d8a7df0805bd1f2865928360cf850 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 5 Jan 2024 23:36:26 +0000 Subject: [PATCH 1/2] refactor: refactor block materialization --- bigframes/_config/sampling_options.py | 24 ++++ bigframes/core/blocks.py | 156 +++++++++++++------------- bigframes/series.py | 3 +- bigframes/session/__init__.py | 2 - 4 files changed, 102 insertions(+), 83 deletions(-) diff --git a/bigframes/_config/sampling_options.py b/bigframes/_config/sampling_options.py index 1742dabe17..a80b9601ca 100644 --- a/bigframes/_config/sampling_options.py +++ b/bigframes/_config/sampling_options.py @@ -14,6 +14,8 @@ """Options for downsampling.""" +from __future__ import annotations + import dataclasses from typing import Literal, Optional @@ -25,6 +27,28 @@ class SamplingOptions: __doc__ = vendored_pandas_config.sampling_options_doc max_download_size: Optional[int] = 500 + # Enable downsampling enable_downsampling: bool = False sampling_method: Literal["head", "uniform"] = "uniform" random_state: Optional[int] = None + + def with_max_download_size(self, max_rows: Optional[int]) -> SamplingOptions: + return SamplingOptions( + max_rows, self.enable_downsampling, self.sampling_method, self.random_state + ) + + def with_method(self, method: Literal["head", "uniform"]) -> SamplingOptions: + return SamplingOptions(self.max_download_size, True, method, self.random_state) + + def with_random_state(self, state: Optional[int]) -> SamplingOptions: + return SamplingOptions( + self.max_download_size, + self.enable_downsampling, + self.sampling_method, + state, + ) + + def with_disabled(self) -> SamplingOptions: + return SamplingOptions( + self.max_download_size, False, self.sampling_method, self.random_state + ) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index e88326795c..07164cedc3 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -21,6 +21,7 @@ from __future__ import annotations +import dataclasses import functools import itertools import random @@ -31,6 +32,7 @@ import google.cloud.bigquery as bigquery import pandas as pd +import bigframes._config.sampling_options as sampling_options import bigframes.constants as constants import bigframes.core as core import bigframes.core.guid as guid @@ -80,6 +82,12 @@ def _get_block(self) -> Block: """Get the underlying block value of the object""" +@dataclasses.dataclass(frozen=True) +class MaterializationOptions: + downsampling: sampling_options.SamplingOptions = sampling_options.SamplingOptions() + ordered: bool = True + + class Block: """A immutable 2D data structure.""" @@ -395,8 +403,6 @@ def _to_dataframe(self, result) -> pd.DataFrame: def to_pandas( self, - value_keys: Optional[Iterable[str]] = None, - max_results: Optional[int] = None, max_download_size: Optional[int] = None, sampling_method: Optional[str] = None, random_state: Optional[int] = None, @@ -404,14 +410,24 @@ def to_pandas( ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" + if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS): + raise NotImplementedError( + f"The downsampling method {sampling_method} is not implemented, " + f"please choose from {','.join(_SAMPLING_METHODS)}." + ) - df, _, query_job = self._compute_and_count( - value_keys=value_keys, - max_results=max_results, - max_download_size=max_download_size, - sampling_method=sampling_method, - random_state=random_state, - ordered=ordered, + sampling = bigframes.options.sampling.with_max_download_size(max_download_size) + if sampling_method is not None: + sampling = sampling.with_method(sampling_method).with_random_state( # type: ignore + random_state + ) + else: + sampling = sampling.with_disabled() + + df, query_job = self._materialize_local( + materialize_options=MaterializationOptions( + downsampling=sampling, ordered=ordered + ) ) return df, query_job @@ -439,57 +455,29 @@ def _copy_index_to_pandas(self, df: pd.DataFrame): # See: https://github.com/pandas-dev/pandas-stubs/issues/804 df.index.names = self.index.names # type: ignore - def _compute_and_count( - self, - value_keys: Optional[Iterable[str]] = None, - max_results: Optional[int] = None, - max_download_size: Optional[int] = None, - sampling_method: Optional[str] = None, - random_state: Optional[int] = None, - *, - ordered: bool = True, - ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: + def _materialize_local( + self, materialize_options: MaterializationOptions = MaterializationOptions() + ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. - enable_downsampling = ( - True - if sampling_method is not None - else bigframes.options.sampling.enable_downsampling - ) - - max_download_size = ( - max_download_size or bigframes.options.sampling.max_download_size - ) - - random_state = random_state or bigframes.options.sampling.random_state - - if sampling_method is None: - sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM - sampling_method = sampling_method.lower() - - if sampling_method not in _SAMPLING_METHODS: - raise NotImplementedError( - f"The downsampling method {sampling_method} is not implemented, " - f"please choose from {','.join(_SAMPLING_METHODS)}." - ) - - expr = self._apply_value_keys_to_expr(value_keys=value_keys) - results_iterator, query_job = self.session._execute( - expr, max_results=max_results, sorted=ordered + self.expr, sorted=materialize_options.ordered ) - table_size = ( self.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES ) + sample_config = materialize_options.downsampling + max_download_size = sample_config.max_download_size fraction = ( max_download_size / table_size if (max_download_size is not None) and (table_size != 0) else 2 ) + # TODO: Maybe materialize before downsampling + # Some downsampling methods if fraction < 1: - if not enable_downsampling: + if not sample_config.enable_downsampling: raise RuntimeError( f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of " f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n" @@ -507,42 +495,53 @@ def _compute_and_count( "\nPlease refer to the documentation for configuring the downloading limit.", UserWarning, ) - if sampling_method == _HEAD: - total_rows = int(results_iterator.total_rows * fraction) - results_iterator.max_results = total_rows - df = self._to_dataframe(results_iterator) - - if self.index_columns: - df.set_index(list(self.index_columns), inplace=True) - df.index.names = self.index.names # type: ignore - elif (sampling_method == _UNIFORM) and (random_state is None): - filtered_expr = self.expr._uniform_sampling(fraction) - block = Block( - filtered_expr, - index_columns=self.index_columns, - column_labels=self.column_labels, - index_labels=self.index.names, - ) - df, total_rows, _ = block._compute_and_count(max_download_size=None) - elif sampling_method == _UNIFORM: - block = self._split( - fracs=(max_download_size / table_size,), - random_state=random_state, - preserve_order=True, - )[0] - df, total_rows, _ = block._compute_and_count(max_download_size=None) - else: - # This part should never be called, just in case. - raise NotImplementedError( - f"The downsampling method {sampling_method} is not implemented, " - f"please choose from {','.join(_SAMPLING_METHODS)}." - ) + total_rows = results_iterator.total_rows + # Remove downsampling config from subsequent invocations, as otherwise could result in many + # iterations if downsampling undershoots + return self._downsample( + total_rows=total_rows, + sampling_method=sample_config.sampling_method, + fraction=fraction, + random_state=sample_config.random_state, + )._materialize_local( + MaterializationOptions(ordered=materialize_options.ordered) + ) else: total_rows = results_iterator.total_rows df = self._to_dataframe(results_iterator) self._copy_index_to_pandas(df) - return df, total_rows, query_job + return df, query_job + + def _downsample( + self, total_rows: int, sampling_method: str, fraction: float, random_state + ) -> Block: + # either selecting fraction or number of rows + if sampling_method == _HEAD: + filtered_block = self.slice(stop=int(total_rows * fraction)) + return filtered_block + elif (sampling_method == _UNIFORM) and (random_state is None): + filtered_expr = self.expr._uniform_sampling(fraction) + block = Block( + filtered_expr, + index_columns=self.index_columns, + column_labels=self.column_labels, + index_labels=self.index.names, + ) + return block + elif sampling_method == _UNIFORM: + block = self._split( + fracs=(fraction,), + random_state=random_state, + preserve_order=True, + )[0] + return block + else: + # This part should never be called, just in case. + raise NotImplementedError( + f"The downsampling method {sampling_method} is not implemented, " + f"please choose from {','.join(_SAMPLING_METHODS)}." + ) def _split( self, @@ -1203,10 +1202,9 @@ def retrieve_repr_request_results( count = self.shape[0] if count > max_results: head_block = self.slice(0, max_results) - computed_df, query_job = head_block.to_pandas(max_results=max_results) else: head_block = self - computed_df, query_job = head_block.to_pandas() + computed_df, query_job = head_block.to_pandas() formatted_df = computed_df.set_axis(self.column_labels, axis=1) # we reset the axis and substitute the bf index name for the default formatted_df.index.name = self.index.name diff --git a/bigframes/series.py b/bigframes/series.py index eefd2b755d..d90fb6a6b9 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -301,14 +301,13 @@ def to_pandas( is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. """ df, query_job = self._block.to_pandas( - (self._value_column,), max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, ordered=ordered, ) self._set_internal_query_job(query_job) - series = df[self._value_column] + series = df.squeeze(axis=1) series.name = self._name return series diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 15c262afa7..d503b844aa 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1497,7 +1497,6 @@ def _execute( self, array_value: core.ArrayValue, job_config: Optional[bigquery.job.QueryJobConfig] = None, - max_results: Optional[int] = None, *, sorted: bool = True, dry_run=False, @@ -1507,7 +1506,6 @@ def _execute( return self._start_query( sql=sql, job_config=job_config, - max_results=max_results, ) def _to_sql( From ad575c947eb11f2e4d46d580d75df96dde83678f Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 9 Jan 2024 20:54:05 +0000 Subject: [PATCH 2/2] fix circular import --- bigframes/core/__init__.py | 11 +++++------ bigframes/core/blocks.py | 6 ++++-- bigframes/core/compile/row_identity.py | 8 ++++---- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 7ff23efad3..164d4e932a 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -21,8 +21,7 @@ import ibis.expr.types as ibis_types import pandas -import bigframes.core.compile.compiled as compiled -import bigframes.core.compile.compiler as compiler +import bigframes.core.compile as compiling import bigframes.core.guid import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference @@ -104,11 +103,11 @@ def _try_evaluate_local(self): def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: return self._compile_ordered().get_column_type(key) - def _compile_ordered(self) -> compiled.OrderedIR: - return compiler.compile_ordered(self.node) + def _compile_ordered(self) -> compiling.OrderedIR: + return compiling.compile_ordered(self.node) - def _compile_unordered(self) -> compiled.UnorderedIR: - return compiler.compile_unordered(self.node) + def _compile_unordered(self) -> compiling.UnorderedIR: + return compiling.compile_unordered(self.node) def row_count(self) -> ArrayValue: """Get number of rows in ArrayValue as a single-entry ArrayValue.""" diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cfc5039862..fd62661018 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -82,9 +82,11 @@ def _get_block(self) -> Block: """Get the underlying block value of the object""" -@dataclasses.dataclass(frozen=True) +@dataclasses.dataclass() class MaterializationOptions: - downsampling: sampling_options.SamplingOptions = sampling_options.SamplingOptions() + downsampling: sampling_options.SamplingOptions = dataclasses.field( + default_factory=sampling_options.SamplingOptions + ) ordered: bool = True diff --git a/bigframes/core/compile/row_identity.py b/bigframes/core/compile/row_identity.py index 71d53f90dc..7a87a435fe 100644 --- a/bigframes/core/compile/row_identity.py +++ b/bigframes/core/compile/row_identity.py @@ -24,7 +24,7 @@ import bigframes.constants as constants import bigframes.core.compile.compiled as compiled -import bigframes.core.joins.name_resolution as naming +import bigframes.core.joins as joining import bigframes.core.ordering as orderings SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"} @@ -68,7 +68,7 @@ def join_by_row_identity_unordered( right_mask = right_relative_predicates if how in ["left", "outer"] else None # Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result - map_left_id, map_right_id = naming.JOIN_NAME_REMAPPER( + map_left_id, map_right_id = joining.JOIN_NAME_REMAPPER( left.column_ids, right.column_ids ) joined_columns = [ @@ -125,10 +125,10 @@ def join_by_row_identity_ordered( right_mask = right_relative_predicates if how in ["left", "outer"] else None # Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result - lpublicmapping, rpublicmapping = naming.JOIN_NAME_REMAPPER( + lpublicmapping, rpublicmapping = joining.JOIN_NAME_REMAPPER( left.column_ids, right.column_ids ) - lhiddenmapping, rhiddenmapping = naming.JoinNameRemapper(namespace="hidden")( + lhiddenmapping, rhiddenmapping = joining.JoinNameRemapper(namespace="hidden")( left._hidden_column_ids, right._hidden_column_ids ) map_left_id = {**lpublicmapping, **lhiddenmapping}