Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions bigframes/_config/sampling_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

"""Options for downsampling."""

from __future__ import annotations

import dataclasses
from typing import Literal, Optional

Expand All @@ -25,6 +27,28 @@ class SamplingOptions:
__doc__ = vendored_pandas_config.sampling_options_doc

max_download_size: Optional[int] = 500
# Enable downsampling
enable_downsampling: bool = False
sampling_method: Literal["head", "uniform"] = "uniform"
random_state: Optional[int] = None

def with_max_download_size(self, max_rows: Optional[int]) -> SamplingOptions:
    """Return a copy of these options with ``max_download_size`` set to *max_rows*."""
    # Every other field is carried over from the current options unchanged.
    return SamplingOptions(
        max_download_size=max_rows,
        enable_downsampling=self.enable_downsampling,
        sampling_method=self.sampling_method,
        random_state=self.random_state,
    )

def with_method(self, method: Literal["head", "uniform"]) -> SamplingOptions:
    """Return a copy with *method* selected; selecting a method turns downsampling on."""
    return SamplingOptions(
        max_download_size=self.max_download_size,
        enable_downsampling=True,
        sampling_method=method,
        random_state=self.random_state,
    )

def with_random_state(self, state: Optional[int]) -> SamplingOptions:
    """Return a copy with ``random_state`` replaced by *state*; other fields kept."""
    return SamplingOptions(
        max_download_size=self.max_download_size,
        enable_downsampling=self.enable_downsampling,
        sampling_method=self.sampling_method,
        random_state=state,
    )

def with_disabled(self) -> SamplingOptions:
    """Return a copy of these options with downsampling switched off."""
    return SamplingOptions(
        max_download_size=self.max_download_size,
        enable_downsampling=False,
        sampling_method=self.sampling_method,
        random_state=self.random_state,
    )
11 changes: 5 additions & 6 deletions bigframes/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@
import ibis.expr.types as ibis_types
import pandas

import bigframes.core.compile.compiled as compiled
import bigframes.core.compile.compiler as compiler
import bigframes.core.compile as compiling
import bigframes.core.expression as expressions
import bigframes.core.guid
import bigframes.core.nodes as nodes
Expand Down Expand Up @@ -104,11 +103,11 @@ def _try_evaluate_local(self):
def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
    """Return the dtype of column *key*, resolved via the compiled ordered IR."""
    ordered_ir = self._compile_ordered()
    return ordered_ir.get_column_type(key)

def _compile_ordered(self) -> compiled.OrderedIR:
return compiler.compile_ordered(self.node)
def _compile_ordered(self) -> compiling.OrderedIR:
return compiling.compile_ordered(self.node)

def _compile_unordered(self) -> compiled.UnorderedIR:
return compiler.compile_unordered(self.node)
def _compile_unordered(self) -> compiling.UnorderedIR:
return compiling.compile_unordered(self.node)

def row_count(self) -> ArrayValue:
"""Get number of rows in ArrayValue as a single-entry ArrayValue."""
Expand Down
158 changes: 79 additions & 79 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from __future__ import annotations

import dataclasses
import functools
import itertools
import random
Expand All @@ -31,6 +32,7 @@
import google.cloud.bigquery as bigquery
import pandas as pd

import bigframes._config.sampling_options as sampling_options
import bigframes.constants as constants
import bigframes.core as core
import bigframes.core.guid as guid
Expand Down Expand Up @@ -80,6 +82,14 @@ def _get_block(self) -> Block:
"""Get the underlying block value of the object"""


@dataclasses.dataclass()
class MaterializationOptions:
    """Options controlling how a Block is materialized into a local pandas DataFrame."""

    # Downsampling configuration to apply when the result exceeds the download
    # limit; default_factory gives each instance its own SamplingOptions object.
    downsampling: sampling_options.SamplingOptions = dataclasses.field(
        default_factory=sampling_options.SamplingOptions
    )
    # Whether the downloaded rows must preserve the block's ordering.
    ordered: bool = True


class Block:
"""A immutable 2D data structure."""

Expand Down Expand Up @@ -395,23 +405,31 @@ def _to_dataframe(self, result) -> pd.DataFrame:

def to_pandas(
self,
value_keys: Optional[Iterable[str]] = None,
max_results: Optional[int] = None,
max_download_size: Optional[int] = None,
sampling_method: Optional[str] = None,
random_state: Optional[int] = None,
*,
ordered: bool = True,
) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
"""Run query and download results as a pandas DataFrame."""
if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS):
raise NotImplementedError(
f"The downsampling method {sampling_method} is not implemented, "
f"please choose from {','.join(_SAMPLING_METHODS)}."
)

df, _, query_job = self._compute_and_count(
value_keys=value_keys,
max_results=max_results,
max_download_size=max_download_size,
sampling_method=sampling_method,
random_state=random_state,
ordered=ordered,
sampling = bigframes.options.sampling.with_max_download_size(max_download_size)
if sampling_method is not None:
sampling = sampling.with_method(sampling_method).with_random_state( # type: ignore
random_state
)
else:
sampling = sampling.with_disabled()

df, query_job = self._materialize_local(
materialize_options=MaterializationOptions(
downsampling=sampling, ordered=ordered
)
)
return df, query_job

Expand Down Expand Up @@ -439,57 +457,29 @@ def _copy_index_to_pandas(self, df: pd.DataFrame):
# See: https://github.com/pandas-dev/pandas-stubs/issues/804
df.index.names = self.index.names # type: ignore

def _compute_and_count(
self,
value_keys: Optional[Iterable[str]] = None,
max_results: Optional[int] = None,
max_download_size: Optional[int] = None,
sampling_method: Optional[str] = None,
random_state: Optional[int] = None,
*,
ordered: bool = True,
) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
def _materialize_local(
self, materialize_options: MaterializationOptions = MaterializationOptions()
) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
"""Run query and download results as a pandas DataFrame. Return the total number of results as well."""
# TODO(swast): Allow for dry run and timeout.
enable_downsampling = (
True
if sampling_method is not None
else bigframes.options.sampling.enable_downsampling
)

max_download_size = (
max_download_size or bigframes.options.sampling.max_download_size
)

random_state = random_state or bigframes.options.sampling.random_state

if sampling_method is None:
sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM
sampling_method = sampling_method.lower()

if sampling_method not in _SAMPLING_METHODS:
raise NotImplementedError(
f"The downsampling method {sampling_method} is not implemented, "
f"please choose from {','.join(_SAMPLING_METHODS)}."
)

expr = self._apply_value_keys_to_expr(value_keys=value_keys)

results_iterator, query_job = self.session._execute(
expr, max_results=max_results, sorted=ordered
self.expr, sorted=materialize_options.ordered
)

table_size = (
self.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES
)
sample_config = materialize_options.downsampling
max_download_size = sample_config.max_download_size
fraction = (
max_download_size / table_size
if (max_download_size is not None) and (table_size != 0)
else 2
)

# TODO: Maybe materialize before downsampling
# Some downsampling methods
if fraction < 1:
if not enable_downsampling:
if not sample_config.enable_downsampling:
raise RuntimeError(
f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of "
f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n"
Expand All @@ -507,42 +497,53 @@ def _compute_and_count(
"\nPlease refer to the documentation for configuring the downloading limit.",
UserWarning,
)
if sampling_method == _HEAD:
total_rows = int(results_iterator.total_rows * fraction)
results_iterator.max_results = total_rows
df = self._to_dataframe(results_iterator)

if self.index_columns:
df.set_index(list(self.index_columns), inplace=True)
df.index.names = self.index.names # type: ignore
elif (sampling_method == _UNIFORM) and (random_state is None):
filtered_expr = self.expr._uniform_sampling(fraction)
block = Block(
filtered_expr,
index_columns=self.index_columns,
column_labels=self.column_labels,
index_labels=self.index.names,
)
df, total_rows, _ = block._compute_and_count(max_download_size=None)
elif sampling_method == _UNIFORM:
block = self._split(
fracs=(max_download_size / table_size,),
random_state=random_state,
preserve_order=True,
)[0]
df, total_rows, _ = block._compute_and_count(max_download_size=None)
else:
# This part should never be called, just in case.
raise NotImplementedError(
f"The downsampling method {sampling_method} is not implemented, "
f"please choose from {','.join(_SAMPLING_METHODS)}."
)
total_rows = results_iterator.total_rows
# Remove downsampling config from subsequent invocations, as otherwise could result in many
# iterations if downsampling undershoots
return self._downsample(
total_rows=total_rows,
sampling_method=sample_config.sampling_method,
fraction=fraction,
random_state=sample_config.random_state,
)._materialize_local(
MaterializationOptions(ordered=materialize_options.ordered)
)
else:
total_rows = results_iterator.total_rows
df = self._to_dataframe(results_iterator)
self._copy_index_to_pandas(df)

return df, total_rows, query_job
return df, query_job

def _downsample(
    self, total_rows: int, sampling_method: str, fraction: float, random_state
) -> Block:
    """Return a Block containing roughly ``fraction`` of this block's rows.

    ``sampling_method`` picks the strategy: take the leading rows, or sample
    uniformly (deterministically when ``random_state`` is provided).
    """
    if sampling_method == _HEAD:
        # Keep only the leading slice of rows.
        return self.slice(stop=int(total_rows * fraction))
    if sampling_method == _UNIFORM:
        if random_state is None:
            # Unseeded uniform sampling: push the sampling into the expression.
            sampled_expr = self.expr._uniform_sampling(fraction)
            return Block(
                sampled_expr,
                index_columns=self.index_columns,
                column_labels=self.column_labels,
                index_labels=self.index.names,
            )
        # Seeded uniform sampling: reuse the split helper and keep the first part.
        return self._split(
            fracs=(fraction,),
            random_state=random_state,
            preserve_order=True,
        )[0]
    # This part should never be called, just in case.
    raise NotImplementedError(
        f"The downsampling method {sampling_method} is not implemented, "
        f"please choose from {','.join(_SAMPLING_METHODS)}."
    )

def _split(
self,
Expand Down Expand Up @@ -1209,10 +1210,9 @@ def retrieve_repr_request_results(
count = self.shape[0]
if count > max_results:
head_block = self.slice(0, max_results)
computed_df, query_job = head_block.to_pandas(max_results=max_results)
else:
head_block = self
computed_df, query_job = head_block.to_pandas()
computed_df, query_job = head_block.to_pandas()
formatted_df = computed_df.set_axis(self.column_labels, axis=1)
# we reset the axis and substitute the bf index name for the default
formatted_df.index.name = self.index.name
Expand Down
8 changes: 4 additions & 4 deletions bigframes/core/compile/row_identity.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

import bigframes.constants as constants
import bigframes.core.compile.compiled as compiled
import bigframes.core.joins.name_resolution as naming
import bigframes.core.joins as joining
import bigframes.core.ordering as orderings

SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"}
Expand Down Expand Up @@ -68,7 +68,7 @@ def join_by_row_identity_unordered(
right_mask = right_relative_predicates if how in ["left", "outer"] else None

# Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result
map_left_id, map_right_id = naming.JOIN_NAME_REMAPPER(
map_left_id, map_right_id = joining.JOIN_NAME_REMAPPER(
left.column_ids, right.column_ids
)
joined_columns = [
Expand Down Expand Up @@ -125,10 +125,10 @@ def join_by_row_identity_ordered(
right_mask = right_relative_predicates if how in ["left", "outer"] else None

# Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result
lpublicmapping, rpublicmapping = naming.JOIN_NAME_REMAPPER(
lpublicmapping, rpublicmapping = joining.JOIN_NAME_REMAPPER(
left.column_ids, right.column_ids
)
lhiddenmapping, rhiddenmapping = naming.JoinNameRemapper(namespace="hidden")(
lhiddenmapping, rhiddenmapping = joining.JoinNameRemapper(namespace="hidden")(
left._hidden_column_ids, right._hidden_column_ids
)
map_left_id = {**lpublicmapping, **lhiddenmapping}
Expand Down
3 changes: 1 addition & 2 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,14 +305,13 @@ def to_pandas(
is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame.
"""
df, query_job = self._block.to_pandas(
(self._value_column,),
max_download_size=max_download_size,
sampling_method=sampling_method,
random_state=random_state,
ordered=ordered,
)
self._set_internal_query_job(query_job)
series = df[self._value_column]
series = df.squeeze(axis=1)
series.name = self._name
return series

Expand Down
2 changes: 0 additions & 2 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1497,7 +1497,6 @@ def _execute(
self,
array_value: core.ArrayValue,
job_config: Optional[bigquery.job.QueryJobConfig] = None,
max_results: Optional[int] = None,
*,
sorted: bool = True,
dry_run=False,
Expand All @@ -1507,7 +1506,6 @@ def _execute(
return self._start_query(
sql=sql,
job_config=job_config,
max_results=max_results,
)

def _to_sql(
Expand Down