2 changes: 1 addition & 1 deletion .kokoro/docs/common.cfg
@@ -20,7 +20,7 @@ env_vars: {
}
env_vars: {
key: "TRAMPOLINE_BUILD_FILE"
value: "git/bigframes/.kokoro/publish-docs.sh"
value: ".kokoro/publish-docs.sh"
}

env_vars: {
.kokoro/publish-docs.sh: file mode changed 100644 → 100755 (no content changes shown)
9 changes: 6 additions & 3 deletions README.rst
@@ -41,6 +41,7 @@ method accepts either a fully-qualified table ID or a SQL query.

import bigframes.pandas as bpd

bpd.options.bigquery.project = your_gcp_project_id
df1 = bpd.read_gbq("project.dataset.table")
df2 = bpd.read_gbq("SELECT a, b, c, FROM `project.dataset.table`")

@@ -260,7 +261,7 @@ To view and manage Cloud Functions functions, use the
`Functions <https://console.cloud.google.com/functions/list?env=gen2>`_
page and use the project picker to select the project in which you
created the function. For easy identification, the names of the functions
created by BigQuery DataFrames are prefixed by ``bigframes-``.
created by BigQuery DataFrames are prefixed by ``bigframes``.

**Requirements**

@@ -283,7 +284,9 @@ following IAM roles:
* BigQuery Data Editor (roles/bigquery.dataEditor)
* BigQuery Connection Admin (roles/bigquery.connectionAdmin)
* Cloud Functions Developer (roles/cloudfunctions.developer)
* Service Account User (roles/iam.serviceAccountUser)
* Service Account User (roles/iam.serviceAccountUser) on the
`service account <https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration>`_
``PROJECT_NUMBER-compute@developer.gserviceaccount.com``
* Storage Object Viewer (roles/storage.objectViewer)
* Project IAM Admin (roles/resourcemanager.projectIamAdmin)

@@ -330,7 +333,7 @@ Data processing location

BigQuery DataFrames is designed for scale, which it achieves by keeping data
and processing on the BigQuery service. However, you can bring data into the
memory of your client machine by calling ``.execute()`` on a DataFrame or Series
memory of your client machine by calling ``.to_pandas()`` on a DataFrame or Series
object. If you choose to do this, the memory limitation of your client machine
applies.
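
As a hedged illustration (``df`` and the query below are placeholders, not part of this change), pulling results into client memory looks like this::

    import bigframes.pandas as bpd

    bpd.options.bigquery.project = your_gcp_project_id
    df = bpd.read_gbq("SELECT a, b FROM `project.dataset.table`")
    local_df = df.to_pandas()  # materializes the result as a local pandas DataFrame
    print(local_df.shape)      # subject to the memory limits of the client machine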

2 changes: 2 additions & 0 deletions bigframes/constants.py
@@ -21,3 +21,5 @@
"Share your usecase with the BigQuery DataFrames team at the "
"https://bit.ly/bigframes-feedback survey."
)

ABSTRACT_METHOD_ERROR_MESSAGE = f"Abstract method. You have likely encountered a bug. Please share this stacktrace and how you reached it with the BigQuery DataFrames team. {FEEDBACK_LINK}"
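
As a hedged aside (the ``Node`` class and its ``compile`` method below are hypothetical, used only for illustration), a constant like this is typically raised from an abstract method so that reaching it surfaces the feedback link:

    import bigframes.constants as constants

    class Node:  # hypothetical abstract base class, not part of this change
        def compile(self):
            # Subclasses must override; reaching this default indicates a bug.
            raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)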
102 changes: 86 additions & 16 deletions bigframes/core/__init__.py
@@ -35,6 +35,7 @@
reencode_order_string,
StringEncoding,
)
import bigframes.core.utils as utils
import bigframes.dtypes
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
@@ -562,6 +563,36 @@ def aggregate(
ordering=ordering,
)

def corr_aggregate(
self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]]
) -> ArrayValue:
"""
Get correlations between each left_column_id and right_column_id, stored in the respective output_column_id.
This uses BigQuery's CORR under the hood, and thus only Pearson's method is used.
Arguments:
corr_aggregations: left_column_id, right_column_id, output_column_id tuples
"""
table = self.to_ibis_expr(ordering_mode="unordered")
stats = {
col_out: table[col_left].corr(table[col_right], how="pop")
for col_left, col_right, col_out in corr_aggregations
}
aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)}
result = table.aggregate(**aggregates)
# Ordering is irrelevant for single-row output, but set the ordering id regardless, as other ops (join, etc.) expect it.
ordering = ExpressionOrdering(
ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)],
total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
)
return ArrayValue(
self._session,
result,
columns=[result[col_id] for col_id in [*stats.keys()]],
hidden_ordering_columns=[result[ORDER_ID_COLUMN]],
ordering=ordering,
)
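
# --- Illustrative usage sketch, not part of this diff: `array_value` is assumed to be an
# --- existing ArrayValue and the column ids are hypothetical. Each (left, right, output)
# --- tuple requests BigQuery CORR(left, right) -- Pearson only -- and the single output row
# --- stores each correlation under its output column id.
corr_aggregations = [
    ("height", "weight", "corr_height_weight"),
    ("height", "age", "corr_height_age"),
]
correlated = array_value.corr_aggregate(corr_aggregations)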

def project_window_op(
self,
column_name: str,
@@ -852,52 +883,91 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal
group_by=group_by,
)

def unpivot_single_row(
def unpivot(
self,
row_labels: typing.Sequence[typing.Hashable],
unpivot_columns: typing.Sequence[typing.Tuple[str, typing.Sequence[str]]],
unpivot_columns: typing.Sequence[
typing.Tuple[str, typing.Sequence[typing.Optional[str]]]
],
*,
passthrough_columns: typing.Sequence[str] = (),
index_col_id: str = "index",
dtype=pandas.Float64Dtype(),
dtype: typing.Union[
bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype]
] = pandas.Float64Dtype(),
) -> ArrayValue:
"""Unpivot a single row."""
# TODO: Generalize to multiple row input
table = self.to_ibis_expr(ordering_mode="unordered")
"""
Unpivot ArrayValue columns.

Args:
row_labels: Identifies the source of the row. Must be equal in length to the source column lists in the unpivot_columns argument.
unpivot_columns: Mapping of output column id to a list of input column ids. Lists of input columns may use None.
passthrough_columns: Columns that will not be unpivoted. Column id will be preserved.
index_col_id (str): The column id to be used for the row labels.
dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns.

Returns:
ArrayValue: The unpivoted ArrayValue
"""
table = self.to_ibis_expr(ordering_mode="offset_col")
sub_expressions = []

# TODO: validate all columns are equal length, as well as row labels
# Use ibis memtable to infer the type of row_labels (if possible)
# TODO: Allow caller to specify dtype
labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type()
labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type)

row_n = len(row_labels)
if not all(
len(source_columns) == row_n for _, source_columns in unpivot_columns
):
raise ValueError("Columns and row labels must all be same length.")

# Select each column
for i in range(row_n):
values = []
for result_col, source_cols in unpivot_columns:
values.append(
ops.AsTypeOp(dtype)._as_ibis(table[source_cols[i]]).name(result_col)
)

for j in range(len(unpivot_columns)):
result_col, source_cols = unpivot_columns[j]
col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype
if source_cols[i] is not None:
values.append(
ops.AsTypeOp(col_dtype)
._as_ibis(table[source_cols[i]])
.name(result_col)
)
else:
values.append(
bigframes.dtypes.literal_to_ibis_scalar(
None, force_dtype=col_dtype
).name(result_col)
)
offsets_value = (
((table[ORDER_ID_COLUMN] * row_n) + i)
.cast(ibis_dtypes.int64)
.name(ORDER_ID_COLUMN),
)
sub_expr = table.select(
ibis_types.literal(row_labels[i]).name(index_col_id),
passthrough_columns,
bigframes.dtypes.literal_to_ibis_scalar(
row_labels[i], force_dtype=labels_dtype # type:ignore
).name(index_col_id),
*values,
ibis_types.literal(i).name(ORDER_ID_COLUMN),
offsets_value,
)
sub_expressions.append(sub_expr)
rotated_table = ibis.union(*sub_expressions)

value_columns = [
rotated_table[value_col_id] for value_col_id, _ in unpivot_columns
]
passthrough_values = [rotated_table[col] for col in passthrough_columns]
return ArrayValue(
session=self._session,
table=rotated_table,
columns=[rotated_table[index_col_id], *value_columns],
columns=[rotated_table[index_col_id], *value_columns, *passthrough_values],
hidden_ordering_columns=[rotated_table[ORDER_ID_COLUMN]],
ordering=ExpressionOrdering(
ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)],
integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
),
)
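
# --- Illustrative usage sketch, not part of this diff: `array_value` and the column ids are
# --- assumed. Two row labels unpivot two source columns into one value column, while an id
# --- column is passed through unchanged on every output row.
unpivoted = array_value.unpivot(
    row_labels=["min", "max"],
    unpivot_columns=[("stat_value", ["min_col", "max_col"])],
    passthrough_columns=["row_id"],
    index_col_id="statistic",
    dtype=pandas.Float64Dtype(),
)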
32 changes: 32 additions & 0 deletions bigframes/core/block_transforms.py
@@ -197,3 +197,35 @@ def rank(
)

return block.select_columns(rownum_col_ids).with_column_labels(labels)


def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"):
"""
Drop na entries from block
"""
if how == "any":
filtered_block = block
for column in block.value_columns:
filtered_block, result_id = filtered_block.apply_unary_op(
column, ops.notnull_op
)
filtered_block = filtered_block.filter(result_id)
filtered_block = filtered_block.drop_columns([result_id])
return filtered_block
else: # "all"
filtered_block = block
predicate = None
for column in block.value_columns:
filtered_block, partial_predicate = filtered_block.apply_unary_op(
column, ops.notnull_op
)
if predicate:
filtered_block, predicate = filtered_block.apply_binary_op(
partial_predicate, predicate, ops.or_op
)
else:
predicate = partial_predicate
if predicate:
filtered_block = filtered_block.filter(predicate)
filtered_block = filtered_block.select_columns(block.value_columns)
return filtered_block
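
# --- Pure-Python mirror of the predicate logic above (illustrative only, not bigframes code):
# --- how="any" keeps a row only when every value is non-null; how="all" keeps it when at
# --- least one value is non-null.
rows = [{"a": 1, "b": None}, {"a": None, "b": None}, {"a": 2, "b": 3}]
kept_for_any = [r for r in rows if all(v is not None for v in r.values())]  # only {"a": 2, "b": 3}
kept_for_all = [r for r in rows if any(v is not None for v in r.values())]  # first and third rows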