Merged
111 changes: 25 additions & 86 deletions bigframes/core/blocks.py
@@ -27,7 +27,6 @@
 import functools
 import itertools
 import random
-import textwrap
 import typing
 from typing import (
     Iterable,
@@ -54,16 +53,13 @@
 from bigframes.core import agg_expressions, local_data
 import bigframes.core as core
 import bigframes.core.agg_expressions as ex_types
-import bigframes.core.compile.googlesql as googlesql
 import bigframes.core.expression as ex
 import bigframes.core.expression as scalars
 import bigframes.core.guid as guid
 import bigframes.core.identifiers
 import bigframes.core.join_def as join_defs
 import bigframes.core.ordering as ordering
 import bigframes.core.pyarrow_utils as pyarrow_utils
-import bigframes.core.schema as bf_schema
-import bigframes.core.sql as sql
 import bigframes.core.utils as utils
 import bigframes.core.window_spec as windows
 import bigframes.dtypes
@@ -2776,14 +2772,6 @@ def _throw_if_null_index(self, opname: str):
             )
 
     def _get_rows_as_json_values(self) -> Block:
-        # We want to preserve any ordering currently present before turning to
-        # direct SQL manipulation. We will restore the ordering when we rebuild
-        # expression.
-        # TODO(shobs): Replace direct SQL manipulation by structured expression
-        # manipulation
-        expr, ordering_column_name = self.expr.promote_offsets()
-        expr_sql = self.session._executor.to_sql(expr)
-
         # Names of the columns to serialize for the row.
         # We will use the repr-eval pattern to serialize a value here and
         # deserialize in the cloud function. Let's make sure that would work.
@@ -2799,93 +2787,44 @@ def _get_rows_as_json_values(self) -> Block:
             )
 
             column_names.append(serialized_column_name)
-        column_names_csv = sql.csv(map(sql.simple_literal, column_names))
 
-        # index columns count
-        index_columns_count = len(self.index_columns)
-
         # column references to form the array of values for the row
         column_types = list(self.index.dtypes) + list(self.dtypes)
         column_references = []
         for type_, col in zip(column_types, self.expr.column_ids):
-            if isinstance(type_, pd.ArrowDtype) and pa.types.is_binary(
-                type_.pyarrow_dtype
-            ):
-                column_references.append(sql.to_json_string(col))
+            if type_ == bigframes.dtypes.BYTES_DTYPE:
+                column_references.append(ops.ToJSONString().as_expr(col))
+            elif type_ == bigframes.dtypes.BOOL_DTYPE:
+                # cast operator produces True/False, but function template expects lower case
+                column_references.append(
+                    ops.lower_op.as_expr(
+                        ops.AsTypeOp(bigframes.dtypes.STRING_DTYPE).as_expr(col)
+                    )
+                )
             else:
-                column_references.append(sql.cast_as_string(col))
-
-        column_references_csv = sql.csv(column_references)
-
-        # types of the columns to serialize for the row
-        column_types_csv = sql.csv(
-            [sql.simple_literal(str(typ)) for typ in column_types]
-        )
+                column_references.append(
+                    ops.AsTypeOp(bigframes.dtypes.STRING_DTYPE).as_expr(col)
+                )
 
         # row dtype to use for deserializing the row as pandas series
         pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types)
         if pandas_row_dtype is None:
             pandas_row_dtype = "object"
-        pandas_row_dtype = sql.simple_literal(str(pandas_row_dtype))
-
-        # create a json column representing row through SQL manipulation
-        row_json_column_name = guid.generate_guid()
-        select_columns = (
-            [ordering_column_name] + list(self.index_columns) + [row_json_column_name]
-        )
-        select_columns_csv = sql.csv(
-            [googlesql.identifier(col) for col in select_columns]
-        )
-        json_sql = f"""\
-With T0 AS (
-{textwrap.indent(expr_sql, "    ")}
-),
-T1 AS (
-    SELECT *,
-        TO_JSON_STRING(JSON_OBJECT(
-            "names", [{column_names_csv}],
-            "types", [{column_types_csv}],
-            "values", [{column_references_csv}],
-            "indexlength", {index_columns_count},
-            "dtype", {pandas_row_dtype}
-        )) AS {googlesql.identifier(row_json_column_name)} FROM T0
-)
-SELECT {select_columns_csv} FROM T1
-"""
-        # The only way this code is used is through the df.apply(axis=1) code path
-        destination, query_job = self.session._loader._query_to_destination(
-            json_sql, cluster_candidates=[ordering_column_name]
-        )
-        if not destination:
-            raise ValueError(f"Query job {query_job} did not produce result table")
-
-        new_schema = (
-            self.expr.schema.select([*self.index_columns])
-            .append(
-                bf_schema.SchemaItem(
-                    row_json_column_name, bigframes.dtypes.STRING_DTYPE
-                )
-            )
-            .append(
-                bf_schema.SchemaItem(ordering_column_name, bigframes.dtypes.INT_DTYPE)
-            )
-        )
+        pandas_row_dtype = str(pandas_row_dtype)
 
-        dest_table = self.session.bqclient.get_table(destination)
-        expr = core.ArrayValue.from_table(
-            dest_table,
-            schema=new_schema,
-            session=self.session,
-            offsets_col=ordering_column_name,
-            n_rows=dest_table.num_rows,
-        ).drop_columns([ordering_column_name])
-        block = Block(
-            expr,
-            index_columns=self.index_columns,
-            column_labels=[row_json_column_name],
-            index_labels=self._index_labels,
+        struct_op = ops.StructOp(
+            column_names=("names", "types", "values", "indexlength", "dtype")
         )
-        return block
+        names_val = ex.const(tuple(column_names))
+        types_val = ex.const(tuple(map(str, column_types)))
+        values_val = ops.ToArrayOp().as_expr(*column_references)
+        indexlength_val = ex.const(len(self.index_columns))
+        dtype_val = ex.const(str(pandas_row_dtype))
+        struct_expr = struct_op.as_expr(
+            names_val, types_val, values_val, indexlength_val, dtype_val
+        )
+        block, col_id = self.project_expr(ops.ToJSONString().as_expr(struct_expr))
+        return block.select_column(col_id)
 
 
 class BlockIndexProperties:
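For orientation (an illustration, not part of the diff): the refactor keeps the same row payload — a JSON object with parallel names/types/values arrays — but builds it with StructOp, ToArrayOp, and ToJSONString expressions instead of handwritten SQL. A minimal sketch of the repr-eval round trip the comments mention, using hypothetical column names and values:

import json

# Hypothetical payload in the shape assembled above; "names" holds
# repr()-serialized labels, "values" holds every cell cast to STRING,
# and "indexlength" marks how many leading entries are index levels.
row_json = json.dumps(
    {
        "names": ["'idx'", "'col0'"],
        "types": ["Int64", "Int64"],
        "values": ["1", "42"],
        "indexlength": 1,
        "dtype": "Int64",  # lcd dtype for rebuilding the pandas Series
    }
)

decoded = json.loads(row_json)
labels = [eval(name) for name in decoded["names"]]  # the eval half of repr-eval
assert labels == ["idx", "col0"]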
8 changes: 4 additions & 4 deletions bigframes/core/compile/ibis_compiler/scalar_op_registry.py
@@ -1301,8 +1301,8 @@ def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON):
 
 
 @scalar_op_compiler.register_unary_op(ops.ToJSONString)
-def to_json_string_op_impl(json_obj: ibis_types.Value):
-    return to_json_string(json_obj=json_obj)
+def to_json_string_op_impl(x: ibis_types.Value):
+    return to_json_string(value=x)
 
 
 @scalar_op_compiler.register_unary_op(ops.JSONValue, pass_op=True)
@@ -2069,9 +2069,9 @@ def json_extract_string_array(  # type: ignore[empty-body]
 
 @ibis_udf.scalar.builtin(name="to_json_string")
 def to_json_string(  # type: ignore[empty-body]
-    json_obj: ibis_dtypes.JSON,
+    value,
 ) -> ibis_dtypes.String:
-    """Convert JSON to STRING."""
+    """Convert value to JSON-formatted string."""
 
 
 @ibis_udf.scalar.builtin(name="json_value")
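Context for the signature change (a simplified stand-in, not ibis's actual machinery): annotating the builtin's parameter as ibis_dtypes.JSON made expression construction reject non-JSON arguments, while the unannotated value lets the same TO_JSON_STRING builtin accept the STRUCT expressions blocks.py now produces. Roughly:

# Toy model of annotation-based argument checking (hypothetical helper names).
def make_to_json_string(required_type=None):
    def to_json_string(value_type):
        if required_type is not None and value_type != required_type:
            raise TypeError(f"expected {required_type}, got {value_type}")
        return "string"  # TO_JSON_STRING always yields a BigQuery STRING

    return to_json_string

strict = make_to_json_string("json")  # old behavior: JSON inputs only
relaxed = make_to_json_string()  # new behavior: any input type
assert relaxed("struct") == "string"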
4 changes: 0 additions & 4 deletions bigframes/core/compile/ibis_types.py
@@ -386,10 +386,6 @@ def literal_to_ibis_scalar(
     ibis_dtype = bigframes_dtype_to_ibis_dtype(force_dtype) if force_dtype else None
 
     if pd.api.types.is_list_like(literal):
-        if validate:
-            raise ValueError(
-                f"List types can't be stored in BigQuery DataFrames. {constants.FEEDBACK_LINK}"
-            )
         # "correct" way would be to use ibis.array, but this produces invalid BQ SQL syntax
         return tuple(literal)
 
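With the validate guard gone, list-like literals are no longer rejected here; they fall through to the tuple conversion retained above. A trivial illustration (hypothetical input):

# A list-like literal now converts instead of raising ValueError.
literal = ["a", "b"]
assert tuple(literal) == ("a", "b")  # what the list-like branch returns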
3 changes: 1 addition & 2 deletions bigframes/dtypes.py
@@ -671,8 +671,7 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]:
     if pd.api.types.is_list_like(literal):
         element_types = [infer_literal_type(i) for i in literal]
         common_type = lcd_type(*element_types)
-        as_arrow = bigframes_dtype_to_arrow_dtype(common_type)
-        return pd.ArrowDtype(as_arrow)
+        return list_type(common_type)
     if pd.api.types.is_dict_like(literal):
         fields = []
         for key in literal.keys():
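After this change, a list literal's inferred dtype is a genuine list dtype rather than the common element dtype converted to arrow. A sketch of the expected result, assuming lcd_type resolves mixed int/float elements to Float64 and list_type wraps the element type in a pyarrow list:

import pandas as pd
import pyarrow as pa

# Expected inferred dtype for a literal like [1, 2.5]: a list-of-double
# ArrowDtype rather than the bare element dtype.
expected = pd.ArrowDtype(pa.list_(pa.float64()))
print(expected)  # list<item: double>[pyarrow]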
6 changes: 0 additions & 6 deletions bigframes/operations/json_ops.py
@@ -107,12 +107,6 @@ class ToJSONString(base_ops.UnaryOp):
     name: typing.ClassVar[str] = "to_json_string"
 
     def output_type(self, *input_types):
-        input_type = input_types[0]
-        if not dtypes.is_json_like(input_type):
-            raise TypeError(
-                "Input type must be a valid JSON object or JSON-formatted string type."
-                + f" Received type: {input_type}"
-            )
         return dtypes.STRING_DTYPE
 
 
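Removing the is_json_like check relaxes ToJSONString to accept any input dtype — which the blocks.py refactor relies on when applying it to a STRUCT expression — while the output dtype stays STRING unconditionally. A standalone sketch of that contract (not the bigframes class itself):

# Relaxed contract: whatever the input dtype, the output dtype is STRING.
def to_json_string_output_type(*input_types):
    return "string[pyarrow]"  # stand-in for bigframes.dtypes.STRING_DTYPE

assert to_json_string_output_type("struct") == "string[pyarrow]"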
2 changes: 1 addition & 1 deletion bigframes/operations/struct_ops.py
@@ -43,7 +43,7 @@ def output_type(self, *input_types):
 @dataclasses.dataclass(frozen=True)
 class StructOp(base_ops.NaryOp):
     name: typing.ClassVar[str] = "struct"
-    column_names: tuple[str]
+    column_names: tuple[str, ...]
 
     def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
         num_input_types = len(input_types)
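The annotation fix is a pure typing correction: tuple[str] means a one-element tuple, whereas tuple[str, ...] means a variable-length tuple of strings — which is what StructOp actually receives (five field names in the blocks.py change). For example:

# tuple[str] admits exactly one string; tuple[str, ...] admits any number.
field_names: tuple[str, ...] = ("names", "types", "values", "indexlength", "dtype")
one_name: tuple[str] = ("names",)  # a five-element tuple here would fail type checking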
7 changes: 0 additions & 7 deletions tests/unit/core/test_dtypes.py
@@ -267,13 +267,6 @@ def test_literal_to_ibis_scalar_converts(literal, ibis_scalar):
     )
 
 
-def test_literal_to_ibis_scalar_throws_on_incompatible_literal():
-    with pytest.raises(
-        ValueError,
-    ):
-        bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"})
-
-
 @pytest.mark.parametrize(
     ["scalar", "expected_dtype"],
     [