From d5981a0f9c37d93127afbca94a413d5505d348cb Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 29 Aug 2023 15:20:35 +0000 Subject: [PATCH] chore: sync changes from internal repo feat: support `Series.corr` fix: raise AttributeError for unimplemented pandas methods feat: support `DataFrame.stack` feat: support `np.arcsin`, `np.arccos`, `np.arctan`, `np.sinh`, `np.cosh`, `np.tanh`, `np.arcsinh`, `np.arccosh`, `np.arctanh`, `np.exp` with Series argument fix: align column names with pandas in `DataFrame.agg` results docs: set `options.bigquery.project` in sample code chore: unit test internal `get_standardized_ids` method fix: include survey link in abstract `NotImplementedError` exception messages perf: lazily instantiate client library objects fix: allow (but still not recommended) `ORDER BY` in `read_gbq` input when an `index_col` is defined feat: support `read_json` with `engine=bigquery` for newline-delimited JSON files chore: remove unneeded `types-retry` reference feat: support `np.sin`, `np.cos`, `np.tan`, `np.log`, `np.log10`, `np.sqrt`, `np.abs` with Series argument fix: label temp table creation jobs with `source=bigquery-dataframes-temp` label fix: support spaces in column names in `DataFrame` initializer chore: fix permissions on publish docs script feat: support `df[my_column] = [a python list]` feat: add `components_`, `explained_variance_`, and `explained_variance_ratio_` properties to `bigframes.ml.decomposition.PCA` chore: add execute permissions on publish docs script docs: fix link to GitHub chore: fix docs build fix: check for IAM role on the BigQuery connection when initializing a `remote_function` chore: revert pin to maximum pytest-retry plugin version in tests Change-Id: Ia2bbfdd6920185ae8888597654d6a1baa0bab9ae --- .kokoro/docs/common.cfg | 2 +- .kokoro/publish-docs.sh | 0 README.rst | 9 +- bigframes/constants.py | 2 + bigframes/core/__init__.py | 102 +- bigframes/core/block_transforms.py | 32 + bigframes/core/blocks.py | 159 ++- bigframes/core/groupby/__init__.py | 23 +- bigframes/core/indexes/__init__.py | 3 - bigframes/core/utils.py | 53 + bigframes/dataframe.py | 167 ++- bigframes/ml/core.py | 32 +- bigframes/ml/decomposition.py | 29 + bigframes/ml/sql.py | 10 + bigframes/operations/__init__.py | 163 +++ bigframes/operations/base.py | 5 + bigframes/pandas/__init__.py | 31 +- bigframes/remote_function.py | 109 +- bigframes/series.py | 55 +- bigframes/session.py | 467 ++++-- mypy.ini | 3 + notebooks/dataframes/dataframe.ipynb | 1249 +++++++++++++++-- .../bq_dataframes_ml_linear_regression.ipynb | 6 +- .../getting_started_bq_dataframes.ipynb | 6 +- noxfile.py | 8 +- setup.py | 2 + testing/constraints-3.9.txt | 2 + tests/system/conftest.py | 77 + tests/system/small/ml/conftest.py | 43 +- tests/system/small/ml/test_core.py | 94 ++ tests/system/small/ml/test_decomposition.py | 109 +- tests/system/small/test_dataframe.py | 80 +- tests/system/small/test_dataframe_io.py | 92 +- tests/system/small/test_groupby.py | 21 +- tests/system/small/test_multiindex.py | 36 + tests/system/small/test_numpy.py | 69 + tests/system/small/test_remote_function.py | 12 + tests/system/small/test_series.py | 20 +- tests/system/small/test_session.py | 125 +- tests/unit/core/test_utils.py | 56 + tests/unit/ml/test_sql.py | 15 + .../bigframes_vendored/pandas/core/frame.py | 144 +- .../bigframes_vendored/pandas/core/generic.py | 37 +- .../pandas/core/groupby/__init__.py | 46 +- .../pandas/core/indexes/accessor.py | 23 +- .../pandas/core/indexes/base.py | 8 +- .../pandas/core/indexing.py | 6
+- .../pandas/core/reshape/concat.py | 4 +- .../pandas/core/reshape/tile.py | 4 +- .../bigframes_vendored/pandas/core/series.py | 206 +-- .../pandas/core/strings/accessor.py | 66 +- .../pandas/core/window/rolling.py | 16 +- .../bigframes_vendored/pandas/io/gbq.py | 4 +- .../bigframes_vendored/pandas/io/parquet.py | 4 +- .../pandas/io/parsers/readers.py | 82 +- .../bigframes_vendored/pandas/io/pickle.py | 4 +- .../bigframes_vendored/sklearn/base.py | 6 +- .../sklearn/cluster/_kmeans.py | 11 +- .../sklearn/compose/_column_transformer.py | 5 +- .../sklearn/decomposition/_pca.py | 54 +- .../sklearn/ensemble/_forest.py | 8 +- .../sklearn/linear_model/_base.py | 7 +- .../sklearn/linear_model/_logistic.py | 3 +- .../sklearn/metrics/_classification.py | 12 +- .../sklearn/metrics/_ranking.py | 8 +- .../sklearn/metrics/_regression.py | 4 +- .../bigframes_vendored/sklearn/pipeline.py | 7 +- .../sklearn/preprocessing/_data.py | 5 +- .../sklearn/preprocessing/_encoder.py | 5 +- .../bigframes_vendored/xgboost/sklearn.py | 6 +- 70 files changed, 3599 insertions(+), 774 deletions(-) mode change 100644 => 100755 .kokoro/publish-docs.sh create mode 100644 tests/system/small/test_numpy.py create mode 100644 tests/unit/core/test_utils.py diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg index ce84d7ec49..bd73988540 100644 --- a/.kokoro/docs/common.cfg +++ b/.kokoro/docs/common.cfg @@ -20,7 +20,7 @@ env_vars: { } env_vars: { key: "TRAMPOLINE_BUILD_FILE" - value: "git/bigframes/.kokoro/publish-docs.sh" + value: ".kokoro/publish-docs.sh" } env_vars: { diff --git a/.kokoro/publish-docs.sh b/.kokoro/publish-docs.sh old mode 100644 new mode 100755 diff --git a/README.rst b/README.rst index 6ae3753eed..935c54cc8b 100644 --- a/README.rst +++ b/README.rst @@ -41,6 +41,7 @@ method accepts either a fully-qualified table ID or a SQL query. import bigframes.pandas as bpd + bpd.options.bigquery.project = your_gcp_project_id df1 = bpd.read_gbq("project.dataset.table") df2 = bpd.read_gbq("SELECT a, b, c, FROM `project.dataset.table`") @@ -260,7 +261,7 @@ To view and manage Cloud Functions functions, use the `Functions `_ page and use the project picker to select the project in which you created the function. For easy identification, the names of the functions -created by BigQuery DataFrames are prefixed by ``bigframes-``. +created by BigQuery DataFrames are prefixed by ``bigframes``. **Requirements** @@ -283,7 +284,9 @@ following IAM roles: * BigQuery Data Editor (roles/bigquery.dataEditor) * BigQuery Connection Admin (roles/bigquery.connectionAdmin) * Cloud Functions Developer (roles/cloudfunctions.developer) -* Service Account User (roles/iam.serviceAccountUser) +* Service Account User (roles/iam.serviceAccountUser) on the + `service account ` + ``PROJECT_NUMBER-compute@developer.gserviceaccount.com`` * Storage Object Viewer (roles/storage.objectViewer) * Project IAM Admin (roles/resourcemanager.projectIamAdmin) @@ -330,7 +333,7 @@ Data processing location BigQuery DataFrames is designed for scale, which it achieves by keeping data and processing on the BigQuery service. However, you can bring data into the -memory of your client machine by calling ``.execute()`` on a DataFrame or Series +memory of your client machine by calling ``.to_pandas()`` on a DataFrame or Series object. If you choose to do this, the memory limitation of your client machine applies. 
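Taken together, the README changes above amount to this usage pattern. A minimal sketch (the project id is a placeholder and the public table is only illustrative):

    import bigframes.pandas as bpd

    bpd.options.bigquery.project = "my-project-id"  # placeholder: use your own billing project
    df = bpd.read_gbq(
        "SELECT name, number FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10"
    )
    # Processing stays in BigQuery; .to_pandas() pulls the result into client memory,
    # at which point the client machine's memory limits apply.
    local_df = df.to_pandas()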
diff --git a/bigframes/constants.py b/bigframes/constants.py index 3f3f155733..90837c79eb 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -21,3 +21,5 @@ "Share your usecase with the BigQuery DataFrames team at the " "https://bit.ly/bigframes-feedback survey." ) + +ABSTRACT_METHOD_ERROR_MESSAGE = f"Abstract method. You have likely encountered a bug. Please share this stacktrace and how you reached it with the BigQuery DataFrames team. {FEEDBACK_LINK}" diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 9f392ce149..d6509e4c0a 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -35,6 +35,7 @@ reencode_order_string, StringEncoding, ) +import bigframes.core.utils as utils import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -562,6 +563,36 @@ def aggregate( ordering=ordering, ) + def corr_aggregate( + self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] + ) -> ArrayValue: + """ + Get correlations between each left_column_id and right_column_id, stored in the respective output_column_id. + This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. + Arguments: + corr_aggregations: left_column_id, right_column_id, output_column_id tuples + """ + table = self.to_ibis_expr(ordering_mode="unordered") + stats = { + col_out: table[col_left].corr(table[col_right], how="pop") + for col_left, col_right, col_out in corr_aggregations + } + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops (join etc.) expect it. + ordering = ExpressionOrdering( + ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return ArrayValue( + self._session, + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + def project_window_op( self, column_name: str, @@ -852,38 +883,75 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal group_by=group_by, ) - def unpivot_single_row( + def unpivot( self, row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[typing.Tuple[str, typing.Sequence[str]]], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], *, + passthrough_columns: typing.Sequence[str] = (), index_col_id: str = "index", - dtype=pandas.Float64Dtype(), + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), ) -> ArrayValue: - """Unpivot a single row.""" - # TODO: Generalize to multiple row input - table = self.to_ibis_expr(ordering_mode="unordered") + """ + Unpivot ArrayValue columns. + + Args: + row_labels: Identifies the source of the row. Must be equal in length to the source column lists in the unpivot_columns argument. + unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. + passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. + index_col_id (str): The column id to be used for the row labels. + dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns.
+ + Returns: + ArrayValue: The unpivoted ArrayValue + """ + table = self.to_ibis_expr(ordering_mode="offset_col") sub_expressions = [] - # TODO: validate all columns are equal length, as well as row labels + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type() + labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type) + row_n = len(row_labels) if not all( len(source_columns) == row_n for _, source_columns in unpivot_columns ): raise ValueError("Columns and row labels must all be same length.") - # Select each column for i in range(row_n): values = [] - for result_col, source_cols in unpivot_columns: - values.append( - ops.AsTypeOp(dtype)._as_ibis(table[source_cols[i]]).name(result_col) - ) - + for j in range(len(unpivot_columns)): + result_col, source_cols = unpivot_columns[j] + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + if source_cols[i] is not None: + values.append( + ops.AsTypeOp(col_dtype) + ._as_ibis(table[source_cols[i]]) + .name(result_col) + ) + else: + values.append( + bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ).name(result_col) + ) + offsets_value = ( + ((table[ORDER_ID_COLUMN] * row_n) + i) + .cast(ibis_dtypes.int64) + .name(ORDER_ID_COLUMN), + ) sub_expr = table.select( - ibis_types.literal(row_labels[i]).name(index_col_id), + passthrough_columns, + bigframes.dtypes.literal_to_ibis_scalar( + row_labels[i], force_dtype=labels_dtype # type:ignore + ).name(index_col_id), *values, - ibis_types.literal(i).name(ORDER_ID_COLUMN), + offsets_value, ) sub_expressions.append(sub_expr) rotated_table = ibis.union(*sub_expressions) @@ -891,13 +959,15 @@ def unpivot_single_row( value_columns = [ rotated_table[value_col_id] for value_col_id, _ in unpivot_columns ] + passthrough_values = [rotated_table[col] for col in passthrough_columns] return ArrayValue( session=self._session, table=rotated_table, - columns=[rotated_table[index_col_id], *value_columns], + columns=[rotated_table[index_col_id], *value_columns, *passthrough_values], hidden_ordering_columns=[rotated_table[ORDER_ID_COLUMN]], ordering=ExpressionOrdering( ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), total_ordering_columns=frozenset([ORDER_ID_COLUMN]), ), ) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index b13d7bf2d3..abf8b887d8 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -197,3 +197,35 @@ def rank( ) return block.select_columns(rownum_col_ids).with_column_labels(labels) + + +def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"): + """ + Drop na entries from block + """ + if how == "any": + filtered_block = block + for column in block.value_columns: + filtered_block, result_id = filtered_block.apply_unary_op( + column, ops.notnull_op + ) + filtered_block = filtered_block.filter(result_id) + filtered_block = filtered_block.drop_columns([result_id]) + return filtered_block + else: # "all" + filtered_block = block + predicate = None + for column in block.value_columns: + filtered_block, partial_predicate = filtered_block.apply_unary_op( + column, ops.notnull_op + ) + if predicate: + filtered_block, predicate = filtered_block.apply_binary_op( + partial_predicate, predicate, ops.or_op + ) + else: + predicate = partial_predicate + if predicate: + 
filtered_block = filtered_block.filter(predicate) + filtered_block = filtered_block.select_columns(block.value_columns) + return filtered_block diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2731990feb..f23a4d0b5c 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -80,15 +80,18 @@ def __init__( self, expr: core.ArrayValue, index_columns: Iterable[str], - column_labels: typing.Union[pd.Index, typing.Sequence[Label]], - index_labels: typing.Union[pd.Index, typing.Sequence[Label], None] = None, + column_labels: typing.Union[pd.Index, typing.Iterable[Label]], + index_labels: typing.Union[pd.Index, typing.Iterable[Label], None] = None, ): """Construct a block object, will create default index if no index columns specified.""" - if index_labels and (len(index_labels) != len(list(index_columns))): - raise ValueError( - "'index_columns' and 'index_labels' must have equal length" - ) - if len(list(index_columns)) == 0: + index_columns = list(index_columns) + if index_labels: + index_labels = list(index_labels) + if len(index_labels) != len(index_columns): + raise ValueError( + "'index_columns' and 'index_labels' must have equal length" + ) + if len(index_columns) == 0: expr, new_index_col_id = expr.promote_offsets() index_columns = [new_index_col_id] self._index_columns = tuple(index_columns) @@ -114,6 +117,7 @@ def __init__( self._stats_cache: dict[str, dict[str, typing.Any]] = { col_id: {} for col_id in self.value_columns } + # TODO(kemppeterson) Add a cache for corr to parallel the single-column stats. @property def index(self) -> indexes.IndexValue: @@ -826,9 +830,7 @@ def aggregate_all_and_pivot( dtype=pd.Float64Dtype(), ) -> Block: aggregations = [(col_id, operation, col_id) for col_id in self.value_columns] - result_expr = self.expr.aggregate( - aggregations, dropna=dropna - ).unpivot_single_row( + result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( row_labels=self.column_labels.to_list(), index_col_id="index", unpivot_columns=[(value_col_id, self.value_columns)], @@ -966,6 +968,26 @@ def get_stat(self, column_id: str, stat: agg_ops.AggregateOp): self._stats_cache[column_id].update(stats_map) return stats_map[stat.name] + def get_corr_stat(self, column_id_left: str, column_id_right: str): + # TODO(kemppeterson): Clean up the column names for DataFrames.corr support + # TODO(kemppeterson): Add a cache here. 
+ corr_aggregations = [ + ( + column_id_left, + column_id_right, + "corr_" + column_id_left + column_id_right, + ) + ] + expr = self.expr.corr_aggregate(corr_aggregations) + expr, offset_index_id = expr.promote_offsets() + block = Block( + expr, + index_columns=[offset_index_id], + column_labels=[a[2] for a in corr_aggregations], + ) + df, _ = block.to_pandas() + return df.loc[0, "corr_" + column_id_left + column_id_right] + def summarize( self, column_ids: typing.Sequence[str], @@ -983,7 +1005,7 @@ def summarize( (col_id, [f"{col_id}-{stat.name}" for stat in stats]) for col_id in column_ids ] - expr = self.expr.aggregate(aggregations).unpivot_single_row( + expr = self.expr.aggregate(aggregations).unpivot( labels, unpivot_columns=columns, index_col_id=label_col_id, @@ -1166,6 +1188,121 @@ def pivot( return result_block.with_column_labels(column_index) + def stack(self): + """Unpivot last column axis level into row axis""" + if isinstance(self.column_labels, pd.MultiIndex): + return self._stack_multi() + else: + return self._stack_mono() + + def _stack_mono(self): + if isinstance(self.column_labels, pd.MultiIndex): + raise ValueError("Expected single level index") + + # These are the values that will be turned into rows + stack_values = self.column_labels.drop_duplicates().sort_values() + + # Get matching columns + unpivot_columns: List[Tuple[str, List[str]]] = [] + dtypes: List[bigframes.dtypes.Dtype] = [] + col_id = guid.generate_guid("unpivot_") + dtype = None + input_columns: Sequence[Optional[str]] = [] + for uvalue in stack_values: + matching_ids = self.label_to_col_id.get(uvalue, []) + input_id = matching_ids[0] if len(matching_ids) > 0 else None + if input_id: + if dtype and dtype != self._column_type(input_id): + raise NotImplementedError( + "Cannot stack columns with non-matching dtypes." 
+ ) + else: + dtype = self._column_type(input_id) + input_columns.append(input_id) + unpivot_columns.append((col_id, input_columns)) + if dtype: + dtypes.append(dtype or pd.Float64Dtype()) + + added_index_column = col_id = guid.generate_guid() + unpivot_expr = self._expr.unpivot( + row_labels=stack_values, + passthrough_columns=self.index_columns, + unpivot_columns=unpivot_columns, + index_col_id=added_index_column, + dtype=dtypes, + ) + block = Block( + unpivot_expr, + index_columns=[*self.index_columns, added_index_column], + column_labels=[None], + index_labels=[*self._index_labels, self.column_labels.names[-1]], + ) + return block + + def _stack_multi(self): + if not isinstance(self.column_labels, pd.MultiIndex): + raise ValueError("Expected multi-index") + + # These are the values that will be turned into rows + stack_values = ( + self.column_labels.get_level_values(-1).drop_duplicates().sort_values() + ) + + result_col_labels = ( + self.column_labels.droplevel(-1) + .drop_duplicates() + .sort_values() + .dropna(how="all") + ) + + # Get matching columns + unpivot_columns: List[Tuple[str, List[str]]] = [] + dtypes = [] + for val in result_col_labels: + col_id = guid.generate_guid("unpivot_") + dtype = None + input_columns: Sequence[Optional[str]] = [] + for uvalue in stack_values: + # Need to unpack if still a multi-index after dropping 1 level + label_to_match = ( + (val, uvalue) if result_col_labels.nlevels == 1 else (*val, uvalue) + ) + matching_ids = self.label_to_col_id.get(label_to_match, []) + input_id = matching_ids[0] if len(matching_ids) > 0 else None + if input_id: + if dtype and dtype != self._column_type(input_id): + raise NotImplementedError( + "Cannot stack columns with non-matching dtypes." + ) + else: + dtype = self._column_type(input_id) + input_columns.append(input_id) + # Input column i is the first one that + unpivot_columns.append((col_id, input_columns)) + if dtype: + dtypes.append(dtype or pd.Float64Dtype()) + + added_index_column = col_id = guid.generate_guid() + unpivot_expr = self._expr.unpivot( + row_labels=stack_values, + passthrough_columns=self.index_columns, + unpivot_columns=unpivot_columns, + index_col_id=added_index_column, + dtype=dtypes, + ) + block = Block( + unpivot_expr, + index_columns=[*self.index_columns, added_index_column], + column_labels=result_col_labels, + index_labels=[*self._index_labels, self.column_labels.names[-1]], + ) + return block + + def _column_type(self, col_id: str) -> bigframes.dtypes.Dtype: + col_offset = self.value_columns.index(col_id) + dtype = self.dtypes[col_offset] + return dtype + @staticmethod def _create_pivot_column_index( value_labels: Sequence[typing.Hashable], columns_values: pd.Index diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 5b217effdd..589c5c251c 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -197,8 +197,11 @@ def _agg_string(self, func: str) -> df.DataFrame: return df.DataFrame(agg_block) def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: - aggregations = [] + aggregations: typing.List[typing.Tuple[str, agg_ops.AggregateOp]] = [] column_labels = [] + + want_aggfunc_level = any(utils.is_list_like(aggs) for aggs in func.values()) + for label, funcs_for_id in func.items(): col_id = self._resolve_label(label) func_list = ( @@ -206,16 +209,22 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: ) for f in func_list: aggregations.append((col_id, agg_ops.lookup_agg_func(f))) - 
column_labels.append((col_id, f)) + column_labels.append(label) agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, as_index=self._as_index, dropna=self._dropna, ) - agg_block = agg_block.with_column_labels( - pd.MultiIndex.from_tuples(column_labels) - ) + if want_aggfunc_level: + agg_block = agg_block.with_column_labels( + utils.combine_indices( + pd.Index(column_labels), + pd.Index(agg[1].name for agg in aggregations), + ) + ) + else: + agg_block = agg_block.with_column_labels(pd.Index(column_labels)) return df.DataFrame(agg_block) def _agg_list(self, func: typing.Sequence) -> df.DataFrame: @@ -234,7 +243,9 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: dropna=self._dropna, ) agg_block = agg_block.with_column_labels( - pd.MultiIndex.from_tuples(column_labels) + pd.MultiIndex.from_tuples( + column_labels, names=[*self._block.column_labels.names, None] + ) ) return df.DataFrame(agg_block) diff --git a/bigframes/core/indexes/__init__.py b/bigframes/core/indexes/__init__.py index d797c57955..184a9ce262 100644 --- a/bigframes/core/indexes/__init__.py +++ b/bigframes/core/indexes/__init__.py @@ -14,10 +14,7 @@ from bigframes.core.indexes.index import Index, IndexValue -INDEX_COLUMN_ID = "bigframes_index_{}" - __all__ = [ "Index", "IndexValue", - "INDEX_COLUMN_ID", ] diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index a330002905..1c0a2a1a81 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -12,10 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. import typing +from typing import Hashable, Iterable, List import pandas as pd import typing_extensions +import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common + +UNNAMED_COLUMN_ID = "bigframes_unnamed_column" +UNNAMED_INDEX_ID = "bigframes_unnamed_index" + def get_axis_number(axis: typing.Union[str, int, None]) -> typing.Literal[0, 1]: if axis in {0, "index", "rows", None}: @@ -31,3 +37,50 @@ def is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence def is_dict_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Mapping]: return pd.api.types.is_dict_like(obj) + + +def combine_indices(index1: pd.Index, index2: pd.Index) -> pd.MultiIndex: + """Combines indices into multi-index while preserving dtypes, names.""" + multi_index = pd.MultiIndex.from_frame( + pd.concat([index1.to_frame(index=False), index2.to_frame(index=False)], axis=1) + ) + # to_frame will produce numbered default names, we don't want these + multi_index.names = [*index1.names, *index2.names] + return multi_index + + +def get_standardized_ids( + col_labels: Iterable[Hashable], idx_labels: Iterable[Hashable] = () +) -> tuple[list[str], list[str]]: + """Get standardized column ids as column_ids_list, index_ids_list. + The standardized column ids must be valid BQ SQL schema column names; they can only be string type and must be unique. + + Args: + col_labels: column labels + + idx_labels: index labels, optional. If empty, will only return column ids. + + Returns: + Tuple of (standardized_column_ids, standardized_index_ids) + """ + col_ids = [ + UNNAMED_COLUMN_ID if col_label is None else str(col_label) + for col_label in col_labels + ] + idx_ids = [ + UNNAMED_INDEX_ID if idx_label is None else str(idx_label) + for idx_label in idx_labels + ] + + ids = idx_ids + col_ids + # Column values will be loaded as null if the column name has spaces.
+ # https://github.com/googleapis/python-bigquery/issues/1566 + ids = [id.replace(" ", "_") for id in ids] + + ids = typing.cast( + List[str], + vendored_pandas_io_common.dedup_names(ids, is_potential_multiindex=False), + ) + idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :] + + return col_ids, idx_ids diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5fbe5d1f9e..ef443db079 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -56,7 +56,6 @@ import bigframes.series import bigframes.series as bf_series import third_party.bigframes_vendored.pandas.core.frame as vendored_pandas_frame -import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing if typing.TYPE_CHECKING: @@ -291,84 +290,59 @@ def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], ) -> DataFrame: - return self._apply_to_rows(ops.AsTypeOp(dtype)) + return self._apply_unary_op(ops.AsTypeOp(dtype)) def _to_sql_query( - self, always_include_index: bool - ) -> Tuple[str, List[Tuple[str, bool]]]: + self, include_index: bool + ) -> Tuple[str, list[str], list[blocks.Label]]: """Compiles this DataFrame's expression tree to SQL, optionally - including unnamed index columns. + including index columns. Args: - always_include_index (bool): - whether to include unnamed index columns. If False, only named - indexes are included. + include_index (bool): + whether to include index columns. - Returns: a tuple of (sql_string, index_column_list) - Each entry in the index column list is a tuple of (column_name, named). - If named is false, then the column name exists only in SQL + Returns: + a tuple of (sql_string, index_column_id_list, index_column_label_list). + If include_index is set to False, index_column_id_list and index_column_label_list + return empty lists. """ # Has to be unordered as it is impossible to order the sql without # including metadata columns in selection with ibis. ibis_expr = self._block.expr.to_ibis_expr(ordering_mode="unordered") - column_labels = list(self._block.column_labels) + col_labels, idx_labels = list(self._block.column_labels), list( + self._block.index_labels + ) + old_col_ids, old_idx_ids = list(self._block.value_columns), list( + self._block.index_columns + ) - # TODO(swast): Need to have a better way of controlling when to include - # the index or not. 
- index_has_names = all([name is not None for name in self.index.names]) - if index_has_names: - column_labels = column_labels + list(self.index.names) - elif always_include_index: - # In this mode include the index even if it is a nameless generated - # column like 'bigframes_index_0' - index_labels = [] - unnamed_index_count = 0 - for index_label in self._block.index_labels: - if isinstance(index_label, str): - index_labels.append(index_label) - else: - index_labels.append( - indexes.INDEX_COLUMN_ID.format(unnamed_index_count), - ) - unnamed_index_count += 1 + if not include_index: + idx_labels, old_idx_ids = [], [] + ibis_expr = ibis_expr.drop(*self._block.index_columns) - column_labels = column_labels + typing.cast( - List[Optional[str]], index_labels - ) + old_ids = old_idx_ids + old_col_ids + + new_col_ids, new_idx_ids = utils.get_standardized_ids(col_labels, idx_labels) + new_ids = new_idx_ids + new_col_ids - column_labels_deduped = typing.cast( - List[str], - vendored_pandas_io_common.dedup_names( - column_labels, is_potential_multiindex=False - ), - ) - column_ids = self._block.value_columns substitutions = {} - for column_id, column_label in zip(column_ids, column_labels_deduped): + for old_id, new_id in zip(old_ids, new_ids): # TODO(swast): Do we need to further escape this, or can we rely on # the BigQuery unicode column name feature? - substitutions[column_id] = column_label - - index_cols: List[Tuple[str, bool]] = [] - first_index_offset = len(self._block.column_labels) - if index_has_names or always_include_index: - for i, index_col in enumerate(self._block.index_columns): - offset = first_index_offset + i - substitutions[index_col] = column_labels_deduped[offset] - index_cols = [ - (label, index_has_names) - for label in column_labels_deduped[first_index_offset:] - ] - else: - ibis_expr = ibis_expr.drop(*self._block.index_columns) + substitutions[old_id] = new_id ibis_expr = ibis_expr.relabel(substitutions) - return typing.cast(str, ibis_expr.compile()), index_cols + return ( + typing.cast(str, ibis_expr.compile()), + new_ids[: len(idx_labels)], + idx_labels, + ) @property def sql(self) -> str: """Compiles this DataFrame's expression tree to SQL.""" - sql, _ = self._to_sql_query(always_include_index=False) + sql, _, _ = self._to_sql_query(include_index=False) return sql @property @@ -469,12 +443,12 @@ def __getattr__(self, key: str): if key in self._block.column_labels: return self.__getitem__(key) elif hasattr(pandas.DataFrame, key): - raise NotImplementedError( + raise AttributeError( textwrap.dedent( f""" - BigQuery DataFrames has not yet implemented an equivalent to - 'pandas.DataFrame.{key}'. {constants.FEEDBACK_LINK} - """ + BigQuery DataFrames has not yet implemented an equivalent to + 'pandas.DataFrame.{key}'. 
{constants.FEEDBACK_LINK} + """ ) ) else: @@ -872,6 +846,32 @@ def _assign_single_item( copy = self.copy() copy[k] = v(copy) return copy + elif utils.is_list_like(v): + given_rows = len(v) + actual_rows = len(self) + if given_rows != actual_rows: + raise ValueError( + f"Length of values ({given_rows}) does not match length of index ({actual_rows})" + ) + + local_df = bigframes.dataframe.DataFrame( + {k: v}, session=self._get_block().expr._session + ) + # local_df is likely (but not guaranteed) to be cached locally + # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE + + this_expr, this_offsets_col_id = self._get_block()._expr.promote_offsets() + block = blocks.Block( + expr=this_expr, + index_labels=self.index.names, + index_columns=self._block.index_columns, + column_labels=[this_offsets_col_id] + list(self._block.value_columns), + ) # offsets are temporarily the first value column, label set to id + this_df_with_offsets = DataFrame(data=block) + join_result = this_df_with_offsets.join( + other=local_df, on=this_offsets_col_id, how="left" + ) + return join_result.drop(columns=[this_offsets_col_id]) else: return self._assign_scalar(k, v) @@ -1024,13 +1024,7 @@ def add_suffix(self, suffix: str, axis: int | str | None = None) -> DataFrame: return DataFrame(self._get_block().add_suffix(suffix, axis)) def dropna(self) -> DataFrame: - block = self._block - for column in self._block.value_columns: - block, result_id = block.apply_unary_op(column, ops.notnull_op) - block = block.filter(result_id) - block = block.drop_columns([result_id]) - - return DataFrame(block) + return DataFrame(block_ops.dropna(self._block, how="any")) def any( self, @@ -1209,6 +1203,14 @@ def pivot( ) return DataFrame(pivot_block) + def stack(self): + # TODO: support 'level' param by simply reordering levels such that selected level is last before passing to Block.stack. + # TODO: support 'dropna' param by executing dropna only conditionally + result_block = block_ops.dropna(self._block.stack(), how="all") + if not isinstance(self.columns, pandas.MultiIndex): + return bigframes.series.Series(result_block) + return DataFrame(result_block) + def _drop_non_numeric(self, keep_bool=True) -> DataFrame: types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) if not keep_bool: @@ -1508,15 +1510,15 @@ def _groupby_series( ) def abs(self) -> DataFrame: - return self._apply_to_rows(ops.abs_op) + return self._apply_unary_op(ops.abs_op) def isna(self) -> DataFrame: - return self._apply_to_rows(ops.isnull_op) + return self._apply_unary_op(ops.isnull_op) isnull = isna def notna(self) -> DataFrame: - return self._apply_to_rows(ops.notnull_op) + return self._apply_unary_op(ops.notnull_op) notnull = notna @@ -1736,7 +1738,7 @@ def to_parquet(self, path: str, *, index: bool = True) -> None: _, query_job = self._block.expr._session._start_query(export_data_statement) self._set_internal_query_job(query_job) - def _apply_to_rows(self, operation: ops.UnaryOp): + def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame: block = self._block.multi_apply_unary_op(self._block.value_columns, operation) return DataFrame(block) @@ -1813,7 +1815,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: # to be applied before passing data to remote function, protecting from bad # inputs causing errors.
reprojected_df = DataFrame(self._block._force_reproject()) - return reprojected_df._apply_to_rows( + return reprojected_df._apply_unary_op( ops.RemoteFunctionOp(func, apply_on_null=(na_action is None)) ) @@ -1871,6 +1873,25 @@ def _slice( block = self._block.slice(start=start, stop=stop, step=step) return DataFrame(block) + def __array_ufunc__( + self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs + ) -> DataFrame: + """Used to support numpy ufuncs. + See: https://numpy.org/doc/stable/reference/ufuncs.html + """ + if ( + inputs[0] is not self + or method != "__call__" + or len(inputs) > 1 + or len(kwargs) > 0 + ): + return NotImplemented + + if ufunc in ops.NUMPY_TO_OP: + return self._apply_unary_op(ops.NUMPY_TO_OP[ufunc]) + + return NotImplemented + def _set_block(self, block: blocks.Block): self._block = block diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 812bb08dc3..27727c9f81 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -74,20 +74,18 @@ def _apply_sql( string from which to construct the output dataframe. It must include the index columns of the input SQL. """ - source_sql, tagged_index_cols = input_data._to_sql_query( - always_include_index=True + source_sql, index_col_ids, index_labels = input_data._to_sql_query( + include_index=True ) - if len(tagged_index_cols) != 1: + if len(index_col_ids) != 1: raise NotImplementedError( f"Only exactly one index column is supported. {constants.FEEDBACK_LINK}" ) - index_col_name, is_named_index = tagged_index_cols[0] sql = func(source_sql) - df = session.read_gbq(sql, index_col=[index_col_name]) - if not is_named_index: - df.index.name = None + df = session.read_gbq(sql, index_col=index_col_ids) + df.index.names = index_labels return df @@ -150,10 +148,10 @@ def forecast(self) -> bpd.DataFrame: def evaluate(self, input_data: Optional[bpd.DataFrame] = None): # TODO: validate input data schema # Note: don't need index as evaluate returns a new table - source_sql, _ = ( - input_data._to_sql_query(always_include_index=False) + source_sql, _, _ = ( + input_data._to_sql_query(include_index=False) if (input_data is not None) - else (None, None) + else (None, None, None) ) sql = ml_sql.ml_evaluate(self.model_name, source_sql) @@ -166,6 +164,20 @@ def centroids(self): return self._session.read_gbq(sql) + def principal_components(self): + assert self._model.model_type == "PCA" + + sql = ml_sql.ml_principal_components(self.model_name) + + return self._session.read_gbq(sql) + + def principal_component_info(self): + assert self._model.model_type == "PCA" + + sql = ml_sql.ml_principal_component_info(self.model_name) + + return self._session.read_gbq(sql) + def copy(self, new_model_name: str, replace: bool = False) -> BqmlModel: job_config = bigquery.job.CopyJobConfig() if replace: diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 76b4f9ced6..16106d3a7b 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -70,6 +70,35 @@ def fit( ) return self + @property + def components_(self) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("A model must be fitted before calling components_.") + + return self._bqml_model.principal_components() + + @property + def explained_variance_(self) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError( + "A model must be fitted before calling explained_variance_." 
) + + return self._bqml_model.principal_component_info()[ + ["principal_component_id", "eigenvalue"] + ].rename(columns={"eigenvalue": "explained_variance"}) + + @property + def explained_variance_ratio_(self) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError( + "A model must be fitted before calling explained_variance_ratio_." + ) + + return self._bqml_model.principal_component_info()[ + ["principal_component_id", "explained_variance_ratio"] + ] + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 80054d40e1..bcd8243582 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -182,3 +182,13 @@ def ml_generate_text_embedding( def ml_forecast(model_name: str) -> str: """Encode ML.FORECAST for BQML""" return f"""SELECT * FROM ML.FORECAST(MODEL `{model_name}`)""" + + +def ml_principal_components(model_name: str) -> str: + """Encode ML.PRINCIPAL_COMPONENTS for BQML""" + return f"""SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `{model_name}`)""" + + +def ml_principal_component_info(model_name: str) -> str: + """Encode ML.PRINCIPAL_COMPONENT_INFO for BQML""" + return f"""SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `{model_name}`)""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 58f19ea8e7..9305cf1dda 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -32,6 +32,12 @@ _ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) _NAN = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.nan)) _INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.inf)) +_NEG_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(-np.inf)) + +# Approximately the highest number you can pass in to the EXP function and get a valid FLOAT64 result +# FLOAT64 has 11 exponent bits, so the max value is about 2**(2**10) +# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow.
+_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) BinaryOp = typing.Callable[[ibis_types.Value, ibis_types.Value], ibis_types.Value] TernaryOp = typing.Callable[ @@ -51,11 +57,142 @@ def is_windowed(self): return False +# Trig Functions class AbsOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).abs() +class SinOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).sin() + + +class CosOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).cos() + + +class TanOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).tan() + + +# Inverse trig functions +class ArcsinOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.asin()) + + +class ArccosOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.acos()) + + +class ArctanOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).atan() + + +# Hyperbolic trig functions +# BQ has these functions, but Ibis doesn't +class SinhOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sinh_result = ( + numeric_value.exp() - (numeric_value.negate()).exp() + ) / _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF * numeric_value.sign(), sinh_result) + + +class CoshOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + cosh_result = ( + numeric_value.exp() + (numeric_value.negate()).exp() + ) / _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, cosh_result) + + +class TanhOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + tanh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / ( + numeric_value.exp() + (numeric_value.negate()).exp() + ) + # Beyond +-20, is effectively just the sign function + domain = numeric_value.abs() < _ibis_num(20) + return (~domain).ifelse(numeric_value.sign(), tanh_result) + + +class ArcsinhOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) + _ibis_num(1)).sqrt() + return (numeric_value.abs() + sqrt_part).ln() * numeric_value.sign() + + +class ArccoshOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) - _ibis_num(1)).sqrt() + acosh_result = (numeric_value + sqrt_part).ln() + domain = numeric_value >= _ibis_num(1) + return (~domain).ifelse(_NAN, acosh_result) + + +class ArctanhOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() < _ibis_num(1) + numerator = numeric_value + _ibis_num(1) + denominator = _ibis_num(1) - numeric_value + ln_input = typing.cast(ibis_types.NumericValue, numerator.div(denominator)) + atanh_result = ln_input.ln().div(2) + + out_of_domain = 
(numeric_value.abs() == _ibis_num(1)).ifelse( + _INF * numeric_value, _NAN + ) + + return (~domain).ifelse(out_of_domain, atanh_result) + + +class SqrtOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value >= _ZERO + return (~domain).ifelse(_NAN, numeric_value.sqrt()) + + +class Log10Op(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.log10()) + + +class LnOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.ln()) + + +class ExpOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, numeric_value.exp()) + + class InvertOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).negate() @@ -484,6 +621,28 @@ def _as_ibis(self, x: ibis_types.Value): year_op = YearOp() capitalize_op = CapitalizeOp() +# Just parameterless unary ops for now +# TODO: Parameter mappings +NUMPY_TO_OP: typing.Final = { + np.sin: SinOp(), + np.cos: CosOp(), + np.tan: TanOp(), + np.arcsin: ArcsinOp(), + np.arccos: ArccosOp(), + np.arctan: ArctanOp(), + np.sinh: SinhOp(), + np.cosh: CoshOp(), + np.tanh: TanhOp(), + np.arcsinh: ArcsinhOp(), + np.arccosh: ArccoshOp(), + np.arctanh: ArctanhOp(), + np.exp: ExpOp(), + np.log: LnOp(), + np.log10: Log10Op(), + np.sqrt: SqrtOp(), + np.abs: AbsOp(), +} + ### Binary Ops def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): @@ -785,3 +944,7 @@ def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp: def is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) + + +def _ibis_num(number: float): + return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 361fdca055..81a5bc4c41 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -167,6 +167,11 @@ def _apply_binary_op( partial_op = ops.BinopPartialRight(op, other) return self._apply_unary_op(partial_op) + def _apply_corr_aggregation(self, other: series.Series) -> float: + (left, right, block) = self._align(other, how="outer") + + return block.get_corr_stat(left, right) + def _align(self, other: series.Series, how="outer") -> tuple[str, str, blocks.Block]: # type: ignore """Aligns the series value with another scalar or series object. 
Returns new left column id, right column id and joined tabled expression.""" values, block = self._align_n( diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index b688c18723..280fce1112 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -148,13 +148,15 @@ def _set_default_session_location_if_possible(query): ): return - bqclient, _, _, _ = bigframes.session._create_cloud_clients( + clients_provider = bigframes.session.ClientsProvider( project=options.bigquery.project, location=options.bigquery.location, use_regional_endpoints=options.bigquery.use_regional_endpoints, credentials=options.bigquery.credentials, ) + bqclient = clients_provider.bqclient + if bigframes.session._is_query(query): job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) options.bigquery.location = job.location @@ -226,6 +228,33 @@ def read_csv( read_csv.__doc__ = inspect.getdoc(bigframes.session.Session.read_csv) +def read_json( + path_or_buf: str | IO["bytes"], + *, + orient: Literal[ + "split", "records", "index", "columns", "values", "table" + ] = "columns", + dtype: Optional[Dict] = None, + encoding: Optional[str] = None, + lines: bool = False, + engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", + **kwargs, +) -> bigframes.dataframe.DataFrame: + return global_session.with_default_session( + bigframes.session.Session.read_json, + path_or_buf=path_or_buf, + orient=orient, + dtype=dtype, + encoding=encoding, + lines=lines, + engine=engine, + **kwargs, + ) + + +read_json.__doc__ = inspect.getdoc(bigframes.session.Session.read_json) + + def read_gbq( query: str, *, diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 7cf74d6311..2a4b919dab 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -34,7 +34,14 @@ import cloudpickle import google.api_core.exceptions -from google.cloud import bigquery, bigquery_connection_v1, functions_v2 +import google.api_core.retry +from google.cloud import ( + bigquery, + bigquery_connection_v1, + functions_v2, + resourcemanager_v3, +) +import google.iam.v1 from ibis.backends.bigquery.compiler import compiles from ibis.backends.bigquery.datatypes import BigQueryType from ibis.expr.datatypes.core import DataType as IbisDataType @@ -152,6 +159,7 @@ def __init__( bq_client, bq_connection_client, bq_connection_id, + cloud_resource_manager_client, ): self._gcp_project_id = gcp_project_id self._cloud_function_region = cloud_function_region @@ -161,6 +169,7 @@ def __init__( self._bq_client = bq_client self._bq_connection_client = bq_connection_client self._bq_connection_id = bq_connection_id + self._cloud_resource_manager_client = cloud_resource_manager_client def create_bq_remote_function( self, input_args, input_types, output_type, endpoint, bq_function_name @@ -175,7 +184,8 @@ def create_bq_remote_function( # raise ValueError("Failed to enable BigQuery Connection API") # If the intended connection does not exist then create it - if self.check_bq_connection_exists(): + service_account_id = self.get_service_account_if_connection_exists() + if service_account_id: logger.info(f"Connector {self._bq_connection_id} already exists") else: connection_name, service_account_id = self.create_bq_connection() @@ -183,21 +193,9 @@ def create_bq_remote_function( f"Created BQ connection {connection_name} with service account id: {service_account_id}" ) - # Set up access on the newly created BQ connection - # 
https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - # We would explicitly wait for 60+ seconds for the IAM binding to take effect - command_iam = ( - f"gcloud projects add-iam-policy-binding {self._gcp_project_id}" - + f' --member="serviceAccount:{service_account_id}"' - + ' --role="roles/run.invoker"' - ) - logger.info(f"Setting up IAM binding on the BQ connection: {command_iam}") - _run_system_command(command_iam) - - logger.info( - f"Waiting {self._iam_wait_seconds} seconds for IAM to take effect.." - ) - time.sleep(self._iam_wait_seconds) + # Ensure IAM role on the BQ connection + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function + self._ensure_iam_binding(service_account_id, "run.invoker") # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 @@ -239,6 +237,53 @@ def get_cloud_function_endpoint(self, name): pass return None + # Introduce retries to accommodate transient errors like etag mismatch, + # which can be caused by a concurrent operation on the same resource, and + # manifests with a message like: + # google.api_core.exceptions.Aborted: 409 There were concurrent policy + # changes. Please retry the whole read-modify-write with exponential + # backoff. The request's ETag '\007\006\003,\264\304\337\272' did not match + # the current policy's ETag '\007\006\003,\3750&\363'. + @google.api_core.retry.Retry( + predicate=google.api_core.retry.if_exception_type( + google.api_core.exceptions.Aborted + ), + initial=10, + maximum=20, + multiplier=2, + timeout=60, + ) + def _ensure_iam_binding(self, service_account: str, role: str): + """Ensure necessary IAM role is configured on a service account.""" + project = f"projects/{self._gcp_project_id}" + service_account = f"serviceAccount:{service_account}" + role = f"roles/{role}" + request = google.iam.v1.iam_policy_pb2.GetIamPolicyRequest(resource=project) + policy = self._cloud_resource_manager_client.get_iam_policy(request=request) + + # Check if the binding already exists, and if it does, do nothing more + for binding in policy.bindings: + if binding.role == role: + if service_account in binding.members: + return + + # Create a new binding + new_binding = google.iam.v1.policy_pb2.Binding( + role=role, members=[service_account] + ) + policy.bindings.append(new_binding) + request = google.iam.v1.iam_policy_pb2.SetIamPolicyRequest( + resource=project, policy=policy + ) + self._cloud_resource_manager_client.set_iam_policy(request=request) + + # We would wait for the IAM policy change to take effect + # https://cloud.google.com/iam/docs/access-change-propagation + logger.info( + f"Waiting {self._iam_wait_seconds} seconds for IAM to take effect.."
+ ) + time.sleep(self._iam_wait_seconds) + def create_bq_connection(self): """Create the BigQuery Connection and returns corresponding service account id.""" client = self._bq_connection_client @@ -253,7 +298,7 @@ def create_bq_connection(self): connection = client.create_connection(request) return connection.name, connection.cloud_resource.service_account_id - def check_bq_connection_exists(self): + def get_service_account_if_connection_exists(self) -> Optional[str]: """Check if the BigQuery Connection exists.""" client = self._bq_connection_client request = bigquery_connection_v1.GetConnectionRequest( @@ -262,12 +307,15 @@ def check_bq_connection_exists(self): ) ) + service_account = None try: - client.get_connection(request=request) - return True + service_account = client.get_connection( + request=request + ).cloud_resource.service_account_id except google.api_core.exceptions.NotFound: pass - return False + + return service_account def generate_udf_code(self, def_, dir): """Generate serialized bytecode using cloudpickle given a udf.""" @@ -624,6 +672,7 @@ def remote_function( bigquery_connection_v1.ConnectionServiceClient ] = None, cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, + resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, dataset: Optional[str] = None, bigquery_connection: Optional[str] = None, reuse: bool = True, @@ -688,6 +737,11 @@ def remote_function( Client to use for BigQuery connection operations. If this param is not provided then bigquery connection client from the session would be used. + resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): + Client to use for cloud resource management operations, e.g. for + getting and setting IAM roles on cloud resources. If this param is + not provided then resource manager client from the session would be + used. dataset (str, Optional.): Dataset in which to create a BigQuery remote function. It should be in `.` or `` format. If this @@ -734,7 +788,17 @@ def remote_function( cloud_functions_client = session.cloudfunctionsclient if not cloud_functions_client: raise ValueError( - "A functions connection client must be provided, either directly or via session. " + "A cloud functions client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A resource manager client is required to get/set IAM operations + if not resource_manager_client: + if session: + resource_manager_client = session.resourcemanagerclient + if not resource_manager_client: + raise ValueError( + "A resource manager client must be provided, either directly or via session. " f"{constants.FEEDBACK_LINK}" ) @@ -819,6 +883,7 @@ def wrapper(f): bigquery_client, bigquery_connection_client, bigquery_connection, + resource_manager_client, ) rf_name, cf_name = remote_function_client.provision_bq_remote_function( f, ibis_signature.input_types, ibis_signature.output_type, uniq_suffix diff --git a/bigframes/series.py b/bigframes/series.py index a1da93dee3..49b0a5b1f0 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -609,6 +609,39 @@ def round_op(x: ibis_types.Value, y: ibis_types.Value): return self._apply_binary_op(decimals, round_op) + def corr(self, other: Series, method="pearson", min_periods=None) -> float: + """ + Compute the correlation with the other Series. Non-number values are ignored in the + computation. + + Uses the "Pearson" method of correlation. 
Numbers are converted to float before + calculation, so the result may be unstable. + + Args: + other (Series): + The series with which this is to be correlated. + method (string, default "pearson"): + Correlation method to use - currently only "pearson" is supported. + min_periods (int, default None): + The minimum number of observations needed to return a result. Non-default values + are not yet supported, so a result will be returned for at least two observations. + + Returns: + float; Will return NaN if there are fewer than two numeric pairs, either series has a + variance or covariance of zero, or any input value is infinite. + """ + # TODO(kemppeterson): Validate early that both are numeric + # TODO(kemppeterson): Handle partially-numeric columns + if method != "pearson": + raise NotImplementedError( + f"Only Pearson correlation is currently supported. {constants.FEEDBACK_LINK}" + ) + if min_periods: + raise NotImplementedError( + f"min_periods not yet supported. {constants.FEEDBACK_LINK}" + ) + return self._apply_corr_aggregation(other) + def all(self) -> bool: return typing.cast(bool, self._apply_aggregation(agg_ops.all_op)) @@ -851,7 +884,7 @@ def __getitem__(self, indexer): def __getattr__(self, key: str): if hasattr(pandas.Series, key): - raise NotImplementedError( + raise AttributeError( textwrap.dedent( f""" BigQuery DataFrames has not yet implemented an equivalent to @@ -1158,6 +1191,26 @@ def to_string( def to_xarray(self): return self.to_pandas().to_xarray() + def __array_ufunc__( + self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs + ) -> Series: + """Used to support numpy ufuncs. + See: https://numpy.org/doc/stable/reference/ufuncs.html + """ + # Only __call__ supported with zero arguments + if ( + inputs[0] is not self + or method != "__call__" + or len(inputs) > 1 + or len(kwargs) > 0 + ): + return NotImplemented + + if ufunc in ops.NUMPY_TO_OP: + return self._apply_unary_op(ops.NUMPY_TO_OP[ufunc]) + + return NotImplemented + # Keep this at the bottom of the Series class to avoid # confusing type checker by overriding str @property diff --git a/bigframes/session.py b/bigframes/session.py index 3ef5250746..ac2f8fa53a 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -47,6 +47,7 @@ import google.cloud.bigquery_connection_v1 import google.cloud.bigquery_storage_v1 import google.cloud.functions_v2 +import google.cloud.resourcemanager_v3 import google.cloud.storage as storage # type: ignore import ibis import ibis.backends.bigquery as ibis_bigquery @@ -69,6 +70,7 @@ import bigframes.core.guid as guid import bigframes.core.io as bigframes_io from bigframes.core.ordering import IntegerEncoding, OrderingColumnReference +import bigframes.core.utils as utils import bigframes.dataframe as dataframe import bigframes.formatting_helpers as formatting_helpers from bigframes.remote_function import read_gbq_function as bigframes_rgf @@ -99,6 +101,16 @@ # TODO(swast): Need to connect to regional endpoints when performing remote # functions operations (BQ Connection IAM, Cloud Run / Cloud Functions). +# Also see if resource manager client library supports regional endpoints. 
+ +_VALID_ENCODINGS = { + "UTF-8", + "ISO-8859-1", + "UTF-16BE", + "UTF-16LE", + "UTF-32BE", + "UTF-32LE", +} logger = logging.getLogger(__name__) @@ -112,90 +124,143 @@ def _get_default_credentials_with_project(): return pydata_google_auth.default(scopes=_SCOPES, use_local_webserver=False) -def _create_cloud_clients( - project: Optional[str], - location: Optional[str], - use_regional_endpoints: Optional[bool], - credentials: Optional[google.auth.credentials.Credentials], -) -> typing.Tuple[ - bigquery.Client, - google.cloud.bigquery_connection_v1.ConnectionServiceClient, - google.cloud.bigquery_storage_v1.BigQueryReadClient, - google.cloud.functions_v2.FunctionServiceClient, -]: - """Create and initialize BigQuery client objects.""" - - credentials_project = None - if credentials is None: - credentials, credentials_project = _get_default_credentials_with_project() - - # Prefer the project in this order: - # 1. Project explicitly specified by the user - # 2. Project set in the environment - # 3. Project associated with the default credentials - project = ( - project - or os.getenv(_ENV_DEFAULT_PROJECT) - or typing.cast(Optional[str], credentials_project) - ) +class ClientsProvider: + """Provides client instances necessary to perform cloud operations.""" - if not project: - raise ValueError( - "Project must be set to initialize BigQuery client. " - "Try setting `bigframes.options.bigquery.project` first." + def __init__( + self, + project: Optional[str], + location: Optional[str], + use_regional_endpoints: Optional[bool], + credentials: Optional[google.auth.credentials.Credentials], + ): + credentials_project = None + if credentials is None: + credentials, credentials_project = _get_default_credentials_with_project() + + # Prefer the project in this order: + # 1. Project explicitly specified by the user + # 2. Project set in the environment + # 3. Project associated with the default credentials + project = ( + project + or os.getenv(_ENV_DEFAULT_PROJECT) + or typing.cast(Optional[str], credentials_project) ) - if use_regional_endpoints: - bq_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERY_REGIONAL_ENDPOINT.format(location=location), - ) - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format(location=location) - ) - bqconnection_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYCONNECTION_REGIONAL_ENDPOINT.format(location=location) - ) - else: - bq_options = None - bqstorage_options = None - bqconnection_options = None - - bq_info = google.api_core.client_info.ClientInfo(user_agent=_APPLICATION_NAME) - bqclient = bigquery.Client( - client_info=bq_info, - client_options=bq_options, - credentials=credentials, - project=project, - location=location, - ) + if not project: + raise ValueError( + "Project must be set to initialize BigQuery client. " + "Try setting `bigframes.options.bigquery.project` first." 
+ ) - bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=_APPLICATION_NAME - ) - bqconnectionclient = google.cloud.bigquery_connection_v1.ConnectionServiceClient( - client_info=bqconnection_info, - client_options=bqconnection_options, - credentials=credentials, - ) + self._project = project + self._location = location + self._use_regional_endpoints = use_regional_endpoints + self._credentials = credentials - bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=_APPLICATION_NAME - ) - bqstorageclient = google.cloud.bigquery_storage_v1.BigQueryReadClient( - client_info=bqstorage_info, - client_options=bqstorage_options, - credentials=credentials, - ) + # cloud clients initialized for lazy load + self._bqclient = None + self._bqconnectionclient = None + self._bqstorageclient = None + self._cloudfunctionsclient = None + self._resourcemanagerclient = None - functions_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=_APPLICATION_NAME - ) - cloudfunctionsclient = google.cloud.functions_v2.FunctionServiceClient( - client_info=functions_info, - credentials=credentials, - ) + @property + def bqclient(self): + if not self._bqclient: + bq_options = None + if self._use_regional_endpoints: + bq_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERY_REGIONAL_ENDPOINT.format( + location=self._location + ), + ) + bq_info = google.api_core.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._bqclient = bigquery.Client( + client_info=bq_info, + client_options=bq_options, + credentials=self._credentials, + project=self._project, + location=self._location, + ) - return bqclient, bqconnectionclient, bqstorageclient, cloudfunctionsclient + return self._bqclient + + @property + def bqconnectionclient(self): + if not self._bqconnectionclient: + bqconnection_options = None + if self._use_regional_endpoints: + bqconnection_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERYCONNECTION_REGIONAL_ENDPOINT.format( + location=self._location + ) + ) + bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._bqconnectionclient = ( + google.cloud.bigquery_connection_v1.ConnectionServiceClient( + client_info=bqconnection_info, + client_options=bqconnection_options, + credentials=self._credentials, + ) + ) + + return self._bqconnectionclient + + @property + def bqstorageclient(self): + if not self._bqstorageclient: + bqstorage_options = None + if self._use_regional_endpoints: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( + location=self._location + ) + ) + bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._bqstorageclient = google.cloud.bigquery_storage_v1.BigQueryReadClient( + client_info=bqstorage_info, + client_options=bqstorage_options, + credentials=self._credentials, + ) + + return self._bqstorageclient + + @property + def cloudfunctionsclient(self): + if not self._cloudfunctionsclient: + functions_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._cloudfunctionsclient = ( + google.cloud.functions_v2.FunctionServiceClient( + client_info=functions_info, + credentials=self._credentials, + ) + ) + + return self._cloudfunctionsclient + + @property + def resourcemanagerclient(self): + if not self._resourcemanagerclient: + resourcemanager_info = 
google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._resourcemanagerclient = ( + google.cloud.resourcemanager_v3.ProjectsClient( + credentials=self._credentials, client_info=resourcemanager_info + ) + ) + + return self._resourcemanagerclient class Session( @@ -221,12 +286,9 @@ def __init__(self, context: Optional[bigquery_options.BigQueryOptions] = None): else: self._location = context.location - ( - self.bqclient, - self.bqconnectionclient, - self.bqstorageclient, - self.cloudfunctionsclient, - ) = _create_cloud_clients( + # Instantiate a clients provider to help with cloud clients that will be + # used in the future operations in the session + self._clients_provider = ClientsProvider( project=context.project, location=self._location, use_regional_endpoints=context.use_regional_endpoints, @@ -249,6 +311,26 @@ def __init__(self, context: Optional[bigquery_options.BigQueryOptions] = None): # changed. context._session_started = True + @property + def bqclient(self): + return self._clients_provider.bqclient + + @property + def bqconnectionclient(self): + return self._clients_provider.bqconnectionclient + + @property + def bqstorageclient(self): + return self._clients_provider.bqstorageclient + + @property + def cloudfunctionsclient(self): + return self._clients_provider.cloudfunctionsclient + + @property + def resourcemanagerclient(self): + return self._clients_provider.resourcemanagerclient + @property def _session_dataset_id(self): """A dataset for storing temporary objects local to the session @@ -343,6 +425,38 @@ def read_gbq( max_results=max_results, ) + def _query_to_destination( + self, query: str, index_cols: List[str] + ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]: + # If there are no index columns, then there's no reason to cache to a + # (clustered) session table, as we'll just have to query it again to + # create a default index & ordering. + if not index_cols: + _, query_job = self._start_query(query) + return query_job.destination, query_job + + # If a dry_run indicates this is not a query type job, then don't + # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. + dry_run_config = bigquery.QueryJobConfig() + dry_run_config.dry_run = True + _, dry_run_job = self._start_query(query, job_config=dry_run_config) + if dry_run_job.statement_type != "SELECT": + _, query_job = self._start_query(query) + return query_job.destination, query_job + + # Make sure we cluster by the index column(s) so that subsequent + # operations are as speedy as they can be. + try: + ibis_expr = self.ibis_client.sql(query) + return self._ibis_to_session_table(ibis_expr, index_cols), None + except google.api_core.exceptions.BadRequest: + # Some SELECT statements still aren't compatible with CREATE TEMP + # TABLE ... AS SELECT ... statements. For example, if the query has + # a top-level ORDER BY, this conflicts with our ability to cluster + # the table by the index column(s). + _, query_job = self._start_query(query) + return query_job.destination, query_job + def read_gbq_query( self, query: str, @@ -368,16 +482,7 @@ def read_gbq_query( else: index_cols = list(index_col) - # Make sure we cluster by the index column so that subsequent - # operations are as speedy as they can be. - if index_cols: - # Since index_cols are specified, assume that we have a normal SQL - # query. DDL or DML not supported. 
- ibis_expr = self.ibis_client.sql(query) - destination = self._ibis_to_session_table(ibis_expr, index_cols) - else: - _, query_job = self._start_query(query) - destination = query_job.destination + destination, query_job = self._query_to_destination(query, index_cols) # If there was no destination table, that means the query must have # been DDL or DML. Return some job metadata, instead. @@ -385,9 +490,11 @@ def read_gbq_query( return dataframe.DataFrame( data=pandas.DataFrame( { - "statement_type": [query_job.statement_type], - "job_id": [query_job.job_id], - "location": [query_job.location], + "statement_type": [ + query_job.statement_type if query_job else "unknown" + ], + "job_id": [query_job.job_id if query_job else "unknown"], + "location": [query_job.location if query_job else "unknown"], } ), session=self, @@ -551,9 +658,10 @@ def _read_gbq_with_ordering( table_expression: ibis_types.Table, *, col_order: Iterable[str] = (), - index_cols: Sequence[str] = (), - index_labels: Sequence[Optional[str]] = (), - hidden_cols: Sequence[str] = (), + col_labels: Iterable[Optional[str]] = (), + index_cols: Iterable[str] = (), + index_labels: Iterable[Optional[str]] = (), + hidden_cols: Iterable[str] = (), ordering: core.ExpressionOrdering, is_total_ordering: bool = False, ) -> dataframe.DataFrame: @@ -563,9 +671,13 @@ def _read_gbq_with_ordering( table_expression: an ibis table expression to be executed in BigQuery. col_order: - List of BigQuery column names in the desired order for results DataFrame. + List of BigQuery column ids in the desired order for results DataFrame. + col_labels: + List of column labels as the column names. index_cols: - List of column names to use as the index or multi-index. + List of index ids to use as the index or multi-index. + index_labels: + List of index labels as names of index. hidden_cols: Columns that should be hidden. Ordering columns may (not always) be hidden ordering: @@ -574,6 +686,7 @@ def _read_gbq_with_ordering( Returns: A DataFrame representing results of the query or table. """ + index_cols, index_labels = list(index_cols), list(index_labels) if len(index_cols) != len(index_labels): raise ValueError( "Needs same number of index labels are there are index columns. " @@ -597,11 +710,14 @@ def _read_gbq_with_ordering( table_expression, index_cols ) index_col_values = [table_expression[index_id] for index_id in index_cols] + if not col_labels: + col_labels = column_keys return self._read_ibis( table_expression, index_col_values, index_labels, column_keys, + col_labels, ordering=ordering, ) @@ -650,9 +766,10 @@ def _read_bigquery_load_job( def _read_ibis( self, table_expression: ibis_types.Table, - index_cols: Sequence[ibis_types.Value], - index_labels: Sequence[Optional[str]], - column_keys: Sequence[str], + index_cols: Iterable[ibis_types.Value], + index_labels: Iterable[blocks.Label], + column_keys: Iterable[str], + column_labels: Iterable[blocks.Label], ordering: core.ExpressionOrdering, ) -> dataframe.DataFrame: """Turns a table expression (plus index column) into a DataFrame.""" @@ -674,7 +791,7 @@ def _read_ibis( self, table_expression, columns, hidden_ordering_columns, ordering ), index_columns=[index_col.get_name() for index_col in index_cols], - column_labels=column_keys, + column_labels=column_labels, index_labels=index_labels, ) @@ -713,15 +830,23 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame Returns: bigframes.dataframe.DataFrame: The BigQuery DataFrame. 
""" + col_labels, idx_labels = ( + pandas_dataframe.columns.to_list(), + pandas_dataframe.index.names, + ) + new_col_ids, new_idx_ids = utils.get_standardized_ids(col_labels, idx_labels) + # Add order column to pandas DataFrame to preserve order in BigQuery ordering_col = "rowid" - columns = frozenset(pandas_dataframe.columns) + columns = frozenset(col_labels + idx_labels) suffix = 2 while ordering_col in columns: ordering_col = f"rowid_{suffix}" suffix += 1 pandas_dataframe_copy = pandas_dataframe.copy() + pandas_dataframe_copy.index.names = new_idx_ids + pandas_dataframe_copy.columns = pandas.Index(new_col_ids) pandas_dataframe_copy[ordering_col] = np.arange(pandas_dataframe_copy.shape[0]) # Specify the datetime dtypes, which is auto-detected as timestamp types. @@ -732,27 +857,12 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame bigquery.SchemaField(column, bigquery.enums.SqlTypeNames.DATETIME) ) - # Unnamed are not copied to BigQuery when load_table_from_dataframe - # executes. - index_cols = list( - filter(lambda name: name is not None, pandas_dataframe_copy.index.names) - ) - index_labels = typing.cast(List[Optional[str]], index_cols) - # Clustering probably not needed anyways as pandas tables are small cluster_cols = [ordering_col] - if len(index_cols) == 0: - # Block constructor will implicitly build default index - pass - job_config = bigquery.LoadJobConfig(schema=schema) job_config.clustering_fields = cluster_cols - # TODO(swast): Rename the unnamed index columns and restore them after - # the load job completes. - # Column values will be loaded as null if the column name has spaces. - # https://github.com/googleapis/python-bigquery/issues/1566 load_table_destination = self._create_session_table() load_job = self.bqclient.load_table_from_dataframe( pandas_dataframe_copy, @@ -770,14 +880,22 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame f"SELECT * FROM `{load_table_destination.table_id}`" ) - return self._read_gbq_with_ordering( + # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression. + if any( + [new_idx_id not in table_expression.columns for new_idx_id in new_idx_ids] + ): + new_idx_ids, idx_labels = [], [] + + df = self._read_gbq_with_ordering( table_expression=table_expression, - index_cols=index_cols, - index_labels=index_labels, + col_labels=col_labels, + index_cols=new_idx_ids, + index_labels=idx_labels, hidden_cols=(ordering_col,), ordering=ordering, is_total_ordering=True, ) + return df def read_csv( self, @@ -844,10 +962,9 @@ def read_csv( f"{constants.FEEDBACK_LINK}" ) - valid_encodings = {"UTF-8", "ISO-8859-1"} - if encoding is not None and encoding not in valid_encodings: + if encoding is not None and encoding not in _VALID_ENCODINGS: raise NotImplementedError( - f"BigQuery engine only supports the following encodings: {valid_encodings}. " + f"BigQuery engine only supports the following encodings: {_VALID_ENCODINGS}. 
" f"{constants.FEEDBACK_LINK}" ) @@ -933,6 +1050,86 @@ def read_parquet( return self._read_bigquery_load_job(path, table, job_config=job_config) + def read_json( + self, + path_or_buf: str | IO["bytes"], + *, + orient: Literal[ + "split", "records", "index", "columns", "values", "table" + ] = "columns", + dtype: Optional[Dict] = None, + encoding: Optional[str] = None, + lines: bool = False, + engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", + **kwargs, + ) -> dataframe.DataFrame: + table = bigquery.Table(self._create_session_table()) + + if engine == "bigquery": + + if dtype is not None: + raise NotImplementedError( + "BigQuery engine does not support the dtype arguments." + ) + + if not lines: + raise NotImplementedError( + "Only newline delimited JSON format is supported." + ) + + if encoding is not None and encoding not in _VALID_ENCODINGS: + raise NotImplementedError( + f"BigQuery engine only supports the following encodings: {_VALID_ENCODINGS}" + ) + + if lines and orient != "records": + raise ValueError( + "'lines' keyword is only valid when 'orient' is 'records'." + ) + + job_config = bigquery.LoadJobConfig() + job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED + job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON + job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY + job_config.autodetect = True + job_config.encoding = encoding + + return self._read_bigquery_load_job( + path_or_buf, + table, + job_config=job_config, + ) + else: + if any(arg in kwargs for arg in ("chunksize", "iterator")): + raise NotImplementedError( + "'chunksize' and 'iterator' arguments are not supported." + ) + + if isinstance(path_or_buf, str): + self._check_file_size(path_or_buf) + + if engine == "ujson": + pandas_df = pandas.read_json( # type: ignore + path_or_buf, + orient=orient, + dtype=dtype, + encoding=encoding, + lines=lines, + **kwargs, + ) + + else: + pandas_df = pandas.read_json( # type: ignore + path_or_buf, + orient=orient, + dtype=dtype, + encoding=encoding, + lines=lines, + engine=engine, + **kwargs, + ) + return self.read_pandas(pandas_df) + def _check_file_size(self, filepath: str): max_size = 1024 * 1024 * 1024 # 1 GB in bytes if filepath.startswith("gs://"): # GCS file path @@ -1008,14 +1205,26 @@ def _query_to_session_table( table = self._create_session_table() cluster_cols_sql = ", ".join(f"`{cluster_col}`" for cluster_col in cluster_cols) - # TODO(swast): This might not support multi-statement SQL queries. + # TODO(swast): This might not support multi-statement SQL queries (scripts). ddl_text = f""" CREATE TEMP TABLE `_SESSION`.`{table.table_id}` CLUSTER BY {cluster_cols_sql} AS {query_text} """ + + job_config = bigquery.QueryJobConfig() + + # Include a label so that Dataplex Lineage can identify temporary + # tables that BigQuery DataFrames creates. Googlers: See internal issue + # 296779699. We're labeling the job instead of the table because + # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not + # supported`. + job_config.labels = {"source": "bigquery-dataframes-temp"} + try: - self._start_query(ddl_text) # Wait for the job to complete + self._start_query( + ddl_text, job_config=job_config + ) # Wait for the job to complete except google.api_core.exceptions.Conflict: # Allow query retry to succeed. 
pass diff --git a/mypy.ini b/mypy.ini index ce78c4686e..901394813a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -15,6 +15,9 @@ ignore_missing_imports = True [mypy-google.colab] ignore_missing_imports = True +[mypy-google.iam.*] +ignore_missing_imports = True + [mypy-pytz] ignore_missing_imports = True diff --git a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb index 241c767f57..85ea61d281 100644 --- a/notebooks/dataframes/dataframe.ipynb +++ b/notebooks/dataframes/dataframe.ipynb @@ -31,7 +31,22 @@ "execution_count": 2, "id": "96757c59-fc22-420e-a42f-c6cb956110ec", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "944f0e4417154e81b6496302fe756465", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job ac4d1f2b-e9f3-4d95-b78d-57e40eee93fa is RUNNING. Cubs\n", " 175\n", " \n", + " \n", + " 20\n", + " 71ab82a4-6e07-430a-b695-1af3bc42ea61\n", + " 2016\n", + " Nationals\n", + " Cubs\n", + " 257\n", + " \n", + " \n", + " 21\n", + " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 178\n", + " \n", + " \n", + " 22\n", + " 6d111b57-fa0b-4f24-82df-ff33a26f0252\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 171\n", + " \n", + " \n", + " 23\n", + " a97e9539-bbbd-4e03-bf15-f25ea2c1d923\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 248\n", + " \n", + " \n", + " 24\n", + " dc0c9218-505c-4725-8c0c-40b72cca0956\n", + " 2016\n", + " Astros\n", + " Cubs\n", + " 174\n", + " \n", " \n", "\n", + "
<p>25 rows × 5 columns</p>
\n", "[2431 rows x 5 columns in total]" ], "text/plain": [ @@ -419,6 +503,34 @@ "tags": [] }, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "276760df4c904ced81cbaff3a65d026e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 1943ae42-bcbd-4c2f-914f-209377b5c4d9 is DONE. 0 Bytes processed.
Cubs\n", " 175\n", " \n", + " \n", + " 20\n", + " 71ab82a4-6e07-430a-b695-1af3bc42ea61\n", + " 2016\n", + " Nationals\n", + " Cubs\n", + " 257\n", + " \n", + " \n", + " 21\n", + " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 178\n", + " \n", + " \n", + " 22\n", + " 6d111b57-fa0b-4f24-82df-ff33a26f0252\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 171\n", + " \n", + " \n", + " 23\n", + " a97e9539-bbbd-4e03-bf15-f25ea2c1d923\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 248\n", + " \n", + " \n", + " 24\n", + " dc0c9218-505c-4725-8c0c-40b72cca0956\n", + " 2016\n", + " Astros\n", + " Cubs\n", + " 174\n", + " \n", " \n", "\n", + "
<p>25 rows × 5 columns</p>
\n", "[2431 rows x 5 columns in total]" ], "text/plain": [ @@ -3063,6 +3768,34 @@ "id": "ac3ceabe-4317-453c-9418-826de5094454", "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c73064d64afe41cea6738085b273e29d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 08477df8-3e86-4f94-a905-8ac2f36e2b69 is DONE. 0 Bytes processed.
core.BqmlModel: +def penguins_bqml_kmeans_model( + session: bigframes.Session, penguins_kmeans_model_name: str +) -> core.BqmlModel: model = session.bqclient.get_model(penguins_kmeans_model_name) return core.BqmlModel(session, model) +@pytest.fixture(scope="session") +def penguins_bqml_pca_model( + session: bigframes.Session, penguins_pca_model_name: str +) -> core.BqmlModel: + model = session.bqclient.get_model(penguins_pca_model_name) + return core.BqmlModel(session, model) + + @pytest.fixture(scope="session") def penguins_linear_model( session, penguins_linear_model_name: str @@ -140,32 +147,12 @@ def penguins_kmeans_model(session, penguins_kmeans_model_name: str) -> cluster.K @pytest.fixture(scope="session") def penguins_pca_model( - session: bigframes.Session, dataset_id_permanent, penguins_table_id + session: bigframes.Session, penguins_pca_model_name: str ) -> decomposition.PCA: - - # TODO(yunmengxie): Create a shared method to get different types of pretrained models. - sql = f""" -CREATE OR REPLACE MODEL `$model_name` -OPTIONS ( - model_type='pca', - num_principal_components=3 -) AS SELECT - * -FROM `{penguins_table_id}`""" - # We use the SQL hash as the name to ensure the model is regenerated if this fixture is edited - model_name = ( - f"{dataset_id_permanent}.penguins_pca_{hashlib.md5(sql.encode()).hexdigest()}" + return cast( + decomposition.PCA, + session.read_gbq_model(penguins_pca_model_name), ) - sql = sql.replace("$model_name", model_name) - - try: - return session.read_gbq_model(model_name) - except google.cloud.exceptions.NotFound: - logging.info( - "penguins_pca_model fixture was not found in the permanent dataset, regenerating it..." - ) - session.bqclient.query(sql).result() - return session.read_gbq_model(model_name) @pytest.fixture(scope="session") diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 4b184b0d4c..6c3e8e06f5 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -140,6 +140,100 @@ def test_model_centroids(penguins_bqml_kmeans_model: core.BqmlModel): ) +def test_pca_model_principal_components(penguins_bqml_pca_model: core.BqmlModel): + result = penguins_bqml_pca_model.principal_components().to_pandas() + assert result.shape == (21, 4) + + # result is too long, only check the first principal component here. 
+ result = result.head(7) + expected = pd.DataFrame( + { + "principal_component_id": [0] * 7, + "feature": [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", + ], + "numerical_value": [ + pd.NA, + pd.NA, + 0.401489, + -0.377482, + 0.524052, + 0.501174, + pd.NA, + ], + "categorical_value": [ + [ + { + "category": "Gentoo penguin (Pygoscelis papua)", + "value": 0.25068877125667804, + }, + { + "category": "Adelie Penguin (Pygoscelis adeliae)", + "value": -0.20622291900416198, + }, + { + "category": "Chinstrap penguin (Pygoscelis antarctica)", + "value": -0.030161149275185855, + }, + ], + [ + {"category": "Biscoe", "value": 0.19761120114410635}, + {"category": "Dream", "value": -0.11264736305259061}, + {"category": "Torgersen", "value": -0.07065913511418596}, + ], + [], + [], + [], + [], + [ + {"category": ".", "value": 0.0015916894448071784}, + {"category": "MALE", "value": 0.06869704739750442}, + {"category": "FEMALE", "value": -0.052521171596813174}, + {"category": "_null_filler", "value": -0.0034628622681684906}, + ], + ], + }, + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame + check_index_type=False, + check_dtype=False, + ) + + +def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlModel): + result = penguins_bqml_pca_model.principal_component_info().to_pandas() + assert result.shape == (3, 4) + + expected = pd.DataFrame( + { + "principal_component_id": [0, 1, 2], + "eigenvalue": [3.278657, 1.270829, 1.125354], + "explained_variance_ratio": [0.469357, 0.181926, 0.1611], + "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383], + }, + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame + check_index_type=False, + check_dtype=False, + ) + + def test_model_predict(penguins_bqml_linear_model: core.BqmlModel, new_penguins_df): predictions = penguins_bqml_linear_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 01d5207750..8df4145fcf 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -55,7 +55,7 @@ def test_pca_predict(session, penguins_pca_model: decomposition.PCA): ) -def test_pca_score(session, penguins_pca_model: decomposition.PCA): +def test_pca_score(penguins_pca_model: decomposition.PCA): result = penguins_pca_model.score().to_pandas() expected = pd.DataFrame( {"total_explained_variance_ratio": [0.812383]}, @@ -68,3 +68,110 @@ def test_pca_score(session, penguins_pca_model: decomposition.PCA): rtol=0.1, check_index_type=False, ) + + +def test_pca_components_(penguins_pca_model: decomposition.PCA): + result = penguins_pca_model.components_.to_pandas() + + # result is too long, only check the first principal component here. 
+ result = result.head(7) + expected = pd.DataFrame( + { + "principal_component_id": [0] * 7, + "feature": [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", + ], + "numerical_value": [ + pd.NA, + pd.NA, + 0.401489, + -0.377482, + 0.524052, + 0.501174, + pd.NA, + ], + "categorical_value": [ + [ + { + "category": "Gentoo penguin (Pygoscelis papua)", + "value": 0.25068877125667804, + }, + { + "category": "Adelie Penguin (Pygoscelis adeliae)", + "value": -0.20622291900416198, + }, + { + "category": "Chinstrap penguin (Pygoscelis antarctica)", + "value": -0.030161149275185855, + }, + ], + [ + {"category": "Biscoe", "value": 0.19761120114410635}, + {"category": "Dream", "value": -0.11264736305259061}, + {"category": "Torgersen", "value": -0.07065913511418596}, + ], + [], + [], + [], + [], + [ + {"category": ".", "value": 0.0015916894448071784}, + {"category": "MALE", "value": 0.06869704739750442}, + {"category": "FEMALE", "value": -0.052521171596813174}, + {"category": "_null_filler", "value": -0.0034628622681684906}, + ], + ], + }, + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + check_index_type=False, + check_dtype=False, + ) + + +def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): + result = penguins_pca_model.explained_variance_.to_pandas() + + expected = pd.DataFrame( + { + "principal_component_id": [0, 1, 2], + "explained_variance": [3.278657, 1.270829, 1.125354], + }, + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + check_index_type=False, + check_dtype=False, + ) + + +def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): + result = penguins_pca_model.explained_variance_ratio_.to_pandas() + + expected = pd.DataFrame( + { + "principal_component_id": [0, 1, 2], + "explained_variance_ratio": [0.469357, 0.181926, 0.1611], + }, + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + check_index_type=False, + check_dtype=False, + ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 5b4f9ebccc..01305adb20 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -77,6 +77,20 @@ def test_df_construct_from_series(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_df_construct_from_dict(): + input_dict = { + "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], + # With a space in column name. We use standardized SQL schema ids to solve the problem that BQ schema doesn't support column names with spaces. 
b/296751058 + "Max Speed": [380.0, 370.0, 24.0, 26.0], + } + bf_result = dataframe.DataFrame(input_dict).to_pandas() + pd_result = pd.DataFrame(input_dict) + + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + def test_get_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" @@ -356,6 +370,52 @@ def test_assign_new_column_w_setitem(scalars_dfs): pd.testing.assert_frame_equal(bf_result, pd_result) +def test_assign_new_column_w_setitem_list(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + # set the custom index + pd_df = pd_df.set_index("string_col") + bf_df = bf_df.set_index("string_col") + + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + pd_df["new_col"] = [1, 2, 3] # should be len 9, is 3 + with pytest.raises(ValueError): + bf_df["new_col"] = [1, 2, 3] + + def test_assign_existing_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs kwargs = {"int64_col": 2} @@ -1329,6 +1389,21 @@ def test_df_describe(scalars_dfs): ).all() +def test_df_stack(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].stack().to_pandas() + pd_result = scalars_pandas_df[columns].stack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("values", "index", "columns"), [ @@ -1734,8 +1809,9 @@ def test_df___array__(scalars_df_index, scalars_pandas_df_index): ) -def test_getattr_not_implemented(scalars_df_index): - with pytest.raises(NotImplementedError): +def test_getattr_attribute_error_when_pandas_has(scalars_df_index): + # asof is implemented in pandas but not in bigframes + with pytest.raises(AttributeError): scalars_df_index.asof() diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 1f5aa906c8..3886b85f40 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -32,6 +32,7 @@ import bigframes import bigframes.dataframe +import bigframes.pandas as bpd def 
test_to_pandas_w_correct_dtypes(scalars_df_default_index): @@ -339,51 +340,68 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index): pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) +def test_to_sql_query_unnamed_index_included( + session: bigframes.Session, + scalars_df_default_index: bpd.DataFrame, + scalars_pandas_df_default_index: pd.DataFrame, +): + bf_df = scalars_df_default_index.reset_index(drop=True) + sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=True) + assert len(idx_labels) == 1 + assert len(idx_ids) == 1 + assert idx_labels[0] is None + assert idx_ids[0].startswith("bigframes") + + pd_df = scalars_pandas_df_default_index.reset_index(drop=True) + roundtrip = session.read_gbq(sql, index_col=idx_ids) + roundtrip.index.names = [None] + assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + + def test_to_sql_query_named_index_included( - session, scalars_df_index, scalars_pandas_df_index + session: bigframes.Session, + scalars_df_default_index: bpd.DataFrame, + scalars_pandas_df_default_index: pd.DataFrame, ): - sql, index_columns = scalars_df_index._to_sql_query(always_include_index=True) - assert len(index_columns) == 1 - index_column, is_named = index_columns[0] - assert index_column == "rowindex" - assert is_named - - roundtrip = session.read_gbq(sql, index_col=[index_column]) - assert_pandas_df_equal_ignore_ordering( - roundtrip.to_pandas(), scalars_pandas_df_index - ) + bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True) + sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=True) + assert len(idx_labels) == 1 + assert len(idx_ids) == 1 + assert idx_labels[0] == "rowindex_2" + assert idx_ids[0] == "rowindex_2" + + pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True) + roundtrip = session.read_gbq(sql, index_col=idx_ids) + assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) def test_to_sql_query_unnamed_index_excluded( - session, scalars_df_default_index, scalars_pandas_df_default_index + session: bigframes.Session, + scalars_df_default_index: bpd.DataFrame, + scalars_pandas_df_default_index: pd.DataFrame, ): - # The .sql property should return SQL without the unnamed indexes - sql, index_columns = scalars_df_default_index._to_sql_query( - always_include_index=False - ) - assert len(index_columns) == 0 + bf_df = scalars_df_default_index.reset_index(drop=True) + sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=False) + assert len(idx_labels) == 0 + assert len(idx_ids) == 0 + pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal_ignore_ordering( - roundtrip.to_pandas(), scalars_pandas_df_default_index - ) + assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) -def test_to_sql_query_unnamed_index_always_include( - session, - scalars_df_default_index: bigframes.dataframe.DataFrame, - scalars_pandas_df_default_index, +def test_to_sql_query_named_index_excluded( + session: bigframes.Session, + scalars_df_default_index: bpd.DataFrame, + scalars_pandas_df_default_index: pd.DataFrame, ): - sql, index_columns = scalars_df_default_index._to_sql_query( - always_include_index=True - ) - assert len(index_columns) == 1 - index_column, is_named = index_columns[0] - assert index_column == "bigframes_index_0" - assert not is_named - - roundtrip = session.read_gbq(sql, index_col=[index_column]) - roundtrip.index.name = None - assert_pandas_df_equal_ignore_ordering( - roundtrip.to_pandas(), 
scalars_pandas_df_default_index - ) + bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True) + sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=False) + assert len(idx_labels) == 0 + assert len(idx_ids) == 0 + + pd_df = scalars_pandas_df_default_index.set_index( + "rowindex_2", drop=True + ).reset_index(drop=True) + roundtrip = session.read_gbq(sql) + assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 5a2562bfb2..987368ce77 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -112,7 +112,9 @@ def test_dataframe_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) -def test_dataframe_groupby_agg_dict(scalars_df_index, scalars_pandas_df_index): +def test_dataframe_groupby_agg_dict_with_list( + scalars_df_index, scalars_pandas_df_index +): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = ( scalars_df_index[col_names] @@ -129,6 +131,23 @@ def test_dataframe_groupby_agg_dict(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +def test_dataframe_groupby_agg_dict_no_lists(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] + bf_result = ( + scalars_df_index[col_names] + .groupby("string_col") + .agg({"int64_too": "mean", "string_col": "count"}) + ) + pd_result = ( + scalars_pandas_df_index[col_names] + .groupby("string_col") + .agg({"int64_too": "mean", "string_col": "count"}) + ) + bf_result_computed = bf_result.to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + + def test_dataframe_groupby_agg_named(scalars_df_index, scalars_pandas_df_index): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = ( diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index b2937d7da9..1baf3e6650 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -620,3 +620,39 @@ def test_column_multi_index_cumsum(scalars_df_index, scalars_pandas_df_index): pd_result = pd_df.cumsum() pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_column_multi_index_stack(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "rowindex_2"] + level1 = pandas.Index(["b", "a", "b"]) + # Need resulting column to be pyarrow string rather than object dtype + level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]") + multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.stack().to_pandas() + pd_result = pd_df.stack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "rowindex_2"] + level1 = pandas.Index(["b", pandas.NA, pandas.NA]) + # Need resulting column to be pyarrow string rather than object dtype + level2 = pandas.Index([pandas.NA, "b", "b"], 
dtype="string[pyarrow]") + multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.stack().to_pandas() + pd_result = pd_df.stack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py new file mode 100644 index 0000000000..fff689caba --- /dev/null +++ b/tests/system/small/test_numpy.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + + +@pytest.mark.parametrize( + ("opname",), + [ + ("sin",), + ("cos",), + ("tan",), + ("arcsin",), + ("arccos",), + ("arctan",), + ("sinh",), + ("cosh",), + ("tanh",), + ("arcsinh",), + ("arccosh",), + ("arctanh",), + ("exp",), + ("log",), + ("log10",), + ("sqrt",), + ("abs",), + ], +) +def test_series_ufuncs(floats_pd, floats_bf, opname): + bf_result = getattr(np, opname)(floats_bf).to_pandas() + pd_result = getattr(np, opname)(floats_pd) + + pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("opname",), + [ + ("sin",), + ("cos",), + ("tan",), + ("log",), + ("log10",), + ("sqrt",), + ("abs",), + ], +) +def test_df_ufuncs(scalars_dfs, opname): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = getattr(np, opname)( + scalars_df[["float64_col", "int64_col"]] + ).to_pandas() + pd_result = getattr(np, opname)(scalars_pandas_df[["float64_col", "int64_col"]]) + + pd.testing.assert_frame_equal(bf_result, pd_result) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index e40addc4eb..c60d270fca 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -116,6 +116,7 @@ def test_remote_function_direct_no_session_param( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, scalars_dfs, dataset_id_permanent, bq_cf_connection, @@ -126,6 +127,7 @@ def test_remote_function_direct_no_session_param( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. 
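The fixture changes above thread the new `resource_manager_client` through every direct (session-free) `remote_function` call. A hedged sketch of what that usage looks like; every project, dataset, and connection name below is a placeholder, and the input/output type arguments are assumed from the public examples rather than copied from these tests:

```python
from google.cloud import bigquery, bigquery_connection_v1, functions_v2, resourcemanager_v3

from bigframes.remote_function import remote_function

# Placeholder resources; real use also needs the IAM permissions that the
# resource manager client is used to get and set on the project.
bq_client = bigquery.Client(project="my-project", location="us-central1")


@remote_function(
    [int],  # input types (shape assumed, not taken from this patch)
    int,    # output type
    bigquery_client=bq_client,
    bigquery_connection_client=bigquery_connection_v1.ConnectionServiceClient(),
    cloud_functions_client=functions_v2.FunctionServiceClient(),
    resource_manager_client=resourcemanager_v3.ProjectsClient(),  # new parameter
    dataset="my-project.my_dataset",
    bigquery_connection="my-connection",
    reuse=True,
)
def square(x: int) -> int:
    return x * x
```

When any client (or `dataset`/`bigquery_connection`) is omitted, the decorator falls back to the corresponding object from the session, which is the path most users take.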
@@ -166,6 +168,7 @@ def test_remote_function_direct_no_session_param_location_specified( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, scalars_dfs, dataset_id_permanent, bq_cf_connection_location, @@ -176,6 +179,7 @@ def test_remote_function_direct_no_session_param_location_specified( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location, # See e2e tests for tests that actually deploy the Cloud Function. @@ -213,6 +217,7 @@ def test_remote_function_direct_no_session_param_location_mismatched( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, dataset_id_permanent, bq_cf_connection_location_mismatched, ): @@ -224,6 +229,7 @@ def test_remote_function_direct_no_session_param_location_mismatched( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. @@ -238,6 +244,7 @@ def test_remote_function_direct_no_session_param_location_project_specified( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, scalars_dfs, dataset_id_permanent, bq_cf_connection_location_project, @@ -248,6 +255,7 @@ def test_remote_function_direct_no_session_param_location_project_specified( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location_project, # See e2e tests for tests that actually deploy the Cloud Function. @@ -285,6 +293,7 @@ def test_remote_function_direct_no_session_param_project_mismatched( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, dataset_id_permanent, bq_cf_connection_location_project_mismatched, ): @@ -296,6 +305,7 @@ def test_remote_function_direct_no_session_param_project_mismatched( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location_project_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. 
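Each of these direct-invocation tests now requests four separately constructed clients. Inside a `Session`, the equivalent objects come from the lazy `ClientsProvider` introduced earlier in this patch, which defers building each GAPIC client until its property is first read. A minimal sketch of that property-caching pattern with a stand-in client type:

```python
from typing import Optional


class FakeClient:
    """Stand-in for an expensive-to-construct GAPIC client."""

    def __init__(self, name: str):
        print(f"constructing {name}")  # visible side effect for the demo
        self.name = name


class ClientsProvider:
    """Build each client on first access, then cache it; the pattern
    session.py now uses for bqclient, bqconnectionclient, and friends."""

    def __init__(self) -> None:
        self._bqclient: Optional[FakeClient] = None

    @property
    def bqclient(self) -> FakeClient:
        if self._bqclient is None:
            self._bqclient = FakeClient("bigquery")  # runs at most once
        return self._bqclient


provider = ClientsProvider()
provider.bqclient  # prints "constructing bigquery"
provider.bqclient  # cached: constructor does not run again
```

The payoff is that `Session.__init__` no longer pays for clients (Cloud Functions, Resource Manager) that many workloads never touch.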
@@ -530,6 +540,7 @@ def test_read_gbq_function_like_original( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, scalars_df_index, dataset_id_permanent, bq_cf_connection, @@ -541,6 +552,7 @@ def test_read_gbq_function_like_original( bigquery_connection_client=bigqueryconnection_client, dataset=dataset_id_permanent, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, bigquery_connection=bq_cf_connection, reuse=True, ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 70c56e5e13..88ad2245c9 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -459,6 +459,19 @@ def test_mods(scalars_dfs, col_x, col_y, method): pd.testing.assert_series_equal(pd_result, bf_result) +# We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this +# manually with dumb self-correlation instead of parameterized as test_mods is above. +def test_corr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"].corr(scalars_df["int64_too"]) + pd_result = ( + scalars_pandas_df["int64_too"] + .astype("int64") + .corr(scalars_pandas_df["int64_too"].astype("int64")) + ) + assert math.isclose(pd_result, bf_result) + + @pytest.mark.parametrize( ("col_x",), [ @@ -900,7 +913,7 @@ def test_binop_repeated_application_does_row_identity_joins(scalars_dfs): pd_result, ) - bf_sql, _ = bf_series.to_frame()._to_sql_query(always_include_index=True) + bf_sql, _, _ = bf_series.to_frame()._to_sql_query(include_index=True) selects = re.findall("SELECT", bf_sql.upper()) assert 0 < len(selects) < (num_joins // 2) @@ -2222,8 +2235,9 @@ def test_argmax(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result -def test_getattr_not_implemented(scalars_df_index): - with pytest.raises(NotImplementedError): +def test_getattr_attribute_error_when_pandas_has(scalars_df_index): + # asof is implemented in pandas but not in bigframes + with pytest.raises(AttributeError): scalars_df_index.string_col.asof() diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index d825c62561..b7bee16ffd 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -111,6 +111,54 @@ def test_read_gbq_w_col_order( ["uuid"], id="unique_uuid_index_query", ), + pytest.param( + """ + SELECT my_index, my_value + FROM UNNEST( + [ + STRUCT(0, 12), + STRUCT(1, 12), + STRUCT(2, 24) + ] + ) + -- Can't normally cluster tables with ORDER BY clause. 
+ ORDER BY my_index DESC + """, + ["my_index"], + id="unique_index_query_has_order_by", + ), + pytest.param( + """ + WITH my_table AS ( + SELECT * + FROM UNNEST( + [ + STRUCT(0, 12), + STRUCT(1, 12), + STRUCT(2, 24) + ] + ) + ) + SELECT my_index, my_value FROM my_table + """, + ["my_index"], + id="unique_index_query_with_named_table_expression", + ), + pytest.param( + """ + CREATE TEMP TABLE test_read_gbq_w_index_col_unique_index_query_with_script + AS SELECT * FROM UNNEST( + [ + STRUCT(0, 12), + STRUCT(1, 12), + STRUCT(2, 24) + ] + ); + SELECT my_index, my_value FROM test_read_gbq_w_index_col_unique_index_query_with_script + """, + ["my_index"], + id="unique_index_query_with_script", + ), pytest.param( "{scalars_table_id}", ["bool_col"], @@ -221,7 +269,7 @@ def test_read_gbq_w_max_results( assert bf_result.shape[0] == max_results -def test_read_gbq_w_script(session, dataset_id: str): +def test_read_gbq_w_script_no_select(session, dataset_id: str): ddl = f""" CREATE TABLE `{dataset_id}.test_read_gbq_w_ddl` ( `col_a` INT64, @@ -252,6 +300,20 @@ def test_read_pandas(session, scalars_dfs): pd.testing.assert_frame_equal(result, expected) +def test_read_pandas_col_label_w_space(session: bigframes.Session): + expected = pd.DataFrame( + { + "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], + "Max Speed": [380.0, 370.0, 24.0, 26.0], + } + ) + result = session.read_pandas(expected).to_pandas() + + pd.testing.assert_frame_equal( + result, expected, check_index_type=False, check_dtype=False + ) + + def test_read_pandas_multi_index(session, scalars_pandas_df_multi_index): df = session.read_pandas(scalars_pandas_df_multi_index) result = df.to_pandas() @@ -755,6 +817,67 @@ def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder): pd.testing.assert_frame_equal(pd_df_in, pd_df_out) +def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder): + scalars_df, _ = scalars_dfs + path = gcs_folder + "test_read_json_gcs_bq_engine_w_index*.json" + read_path = path.replace("*", FIRST_FILE) + scalars_df.to_json(path, index=False, lines=True, orient="records") + df = session.read_json(read_path, lines=True, orient="records", engine="bigquery") + + # The auto detects of BigQuery load job does not preserve any ordering of columns for json. + pd.testing.assert_index_equal( + df.columns.sort_values(), scalars_df.columns.sort_values() + ) + + # The auto detects of BigQuery load job have restrictions to detect the bytes, + # datetime, numeric and geometry types, so they're skipped here. + df = df.drop(columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"]) + scalars_df = scalars_df.drop( + columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"] + ) + assert df.shape[0] == scalars_df.shape[0] + pd.testing.assert_series_equal( + df.dtypes.sort_index(), scalars_df.dtypes.sort_index() + ) + + +def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): + scalars_df, _ = scalars_dfs + path = gcs_folder + "test_read_json_gcs_default_engine_w_index*.json" + read_path = path.replace("*", FIRST_FILE) + scalars_df.to_json( + path, + index=False, + lines=True, + orient="records", + ) + dtype = scalars_df.dtypes.to_dict() + dtype.pop("geography_col") + + df = session.read_json( + read_path, + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. 
+ dtype=dtype, + lines=True, + orient="records", + ) + + assert df._block._expr._ordering is not None + pd.testing.assert_index_equal(df.columns, scalars_df.columns) + + # The auto detects of BigQuery load job have restrictions to detect the bytes, + # numeric and geometry types, so they're skipped here. + df = df.drop(columns=["bytes_col", "numeric_col", "geography_col"]) + scalars_df = scalars_df.drop(columns=["bytes_col", "numeric_col", "geography_col"]) + + # pandas read_json does not respect the dtype overrides for these columns + df = df.drop(columns=["date_col", "datetime_col", "time_col"]) + scalars_df = scalars_df.drop(columns=["date_col", "datetime_col", "time_col"]) + + assert df.shape[0] == scalars_df.shape[0] + pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) + + def test_session_id(session): assert session._session_id is not None diff --git a/tests/unit/core/test_utils.py b/tests/unit/core/test_utils.py new file mode 100644 index 0000000000..fc34f35d9c --- /dev/null +++ b/tests/unit/core/test_utils.py @@ -0,0 +1,56 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.core import utils + + +def test_get_standardized_ids_columns(): + col_labels = ["string", 0, None, "duplicate", "duplicate", "with space"] + + col_ids, idx_ids = utils.get_standardized_ids(col_labels) + + assert col_ids == [ + "string", + "0", + utils.UNNAMED_COLUMN_ID, + "duplicate", + "duplicate.1", + "with_space", + ] + assert idx_ids == [] + + +def test_get_standardized_ids_indexes(): + col_labels = ["duplicate"] + idx_labels = ["string", 0, None, "duplicate", "duplicate", "with space"] + + col_ids, idx_ids = utils.get_standardized_ids(col_labels, idx_labels) + + assert col_ids == ["duplicate.2"] + assert idx_ids == [ + "string", + "0", + utils.UNNAMED_INDEX_ID, + "duplicate", + "duplicate.1", + "with_space", + ] + + +def test_get_standardized_ids_tuple(): + col_labels = [("foo", 1), ("foo", 2), ("bar", 1)] + + col_ids, _ = utils.get_standardized_ids(col_labels) + + assert col_ids == ["('foo',_1)", "('foo',_2)", "('bar',_1)"] diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index d8c8a2d108..c20a17f7d6 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -162,3 +162,18 @@ def test_ml_generate_text_produces_correct_sql(): == """SELECT * FROM ML.GENERATE_TEXT(MODEL `my_dataset.my_model`, (SELECT * FROM my_table), STRUCT(value AS item))""" ) + + +def test_ml_principal_components_produces_correct_sql(): + sql = ml_sql.ml_principal_components(model_name="my_dataset.my_model") + assert ( + sql == """SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `my_dataset.my_model`)""" + ) + + +def test_ml_principal_component_info_produces_correct_sql(): + sql = ml_sql.ml_principal_component_info(model_name="my_dataset.my_model") + assert ( + sql + == """SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `my_dataset.my_model`)""" + ) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py 
b/third_party/bigframes_vendored/pandas/core/frame.py index 6762afc61f..5a812dae7e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -15,6 +15,7 @@ import numpy +from bigframes import constants from third_party.bigframes_vendored.pandas.core.generic import NDFrame # ----------------------------------------------------------------------- @@ -33,7 +34,7 @@ class DataFrame(NDFrame): @property def shape(self) -> tuple[int, int]: """Return a tuple representing the dimensionality of the DataFrame.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def axes(self) -> list: @@ -67,7 +68,7 @@ def values(self) -> numpy.ndarray: na_value (default None): The value to use for missing values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # IO methods (to / from other formats) @@ -90,7 +91,7 @@ def to_numpy( Returns: numpy.ndarray: The converted NumPy array. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_gbq( self, @@ -124,7 +125,7 @@ def to_gbq( If set, write the ordering of the DataFrame as a column in the result table with this name. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_parquet( self, @@ -151,7 +152,7 @@ def to_parquet( Returns: None. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Unsorted @@ -179,7 +180,7 @@ def assign(self, **kwargs) -> DataFrame: bigframes.dataframe.DataFrame: A new DataFrame with the new columns in addition to all the existing columns. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Reindexing and alignment @@ -211,7 +212,7 @@ def drop( Raises: KeyError: If any of the labels is not found in the selected axis. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rename( self, @@ -233,7 +234,7 @@ def rename( Raises: KeyError: If any of the labels is not found. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: """ @@ -250,7 +251,7 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: Returns: bigframes.dataframe.DataFrame: DataFrame with the new index name """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def set_index( self, @@ -273,7 +274,7 @@ def set_index( Returns: DataFrame: Changed row labels. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def reorder_levels(self, order: Sequence[int | str]) -> DataFrame: """ @@ -287,7 +288,7 @@ def reorder_levels(self, order: Sequence[int | str]) -> DataFrame: Returns: DataFrame: DataFrame of rearranged index. 
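+
+        Examples:
+
+            An illustrative sketch only; assumes ``df`` is a DataFrame with a
+            two-level index named ``["outer", "inner"]``:
+
+            >>> df.reorder_levels(["inner", "outer"])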
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def droplevel(self, level): """ @@ -301,7 +302,7 @@ def droplevel(self, level): Returns: DataFrame: DataFrame with requested index / column level(s) removed. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def reset_index( self, @@ -320,7 +321,7 @@ def reset_index( Returns: bigframes.dataframe.DataFrame: DataFrame with the new index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def drop_duplicates( self, @@ -347,7 +348,7 @@ def drop_duplicates( Returns: bigframes.dataframe.DataFrame: DataFrame with duplicates removed """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def duplicated(self, subset=None, keep="first"): """ @@ -369,7 +370,7 @@ def duplicated(self, subset=None, keep="first"): Returns: bigframes.series.Series: Boolean series for each duplicated rows. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Reindex-based selection methods @@ -382,7 +383,7 @@ def dropna( Returns: bigframes.dataframe.DataFrame: DataFrame with NA entries dropped from it. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Sorting @@ -415,7 +416,7 @@ def sort_values( Returns: DataFrame with sorted values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sort_index( self, @@ -425,7 +426,7 @@ def sort_index( Returns: The original DataFrame sorted by the labels. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Arithmetic Methods @@ -450,7 +451,7 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: Returns: Result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ne(self, other, axis: str | int = "columns") -> DataFrame: """ @@ -471,7 +472,7 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: Result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def le(self, other, axis: str | int = "columns") -> DataFrame: """Get 'less than or equal to' of dataframe and other, element-wise (binary operator `<=`). @@ -497,7 +498,7 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame of bool. The result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def lt(self, other, axis: str | int = "columns") -> DataFrame: """Get 'less than' of DataFrame and other, element-wise (binary operator `<`). @@ -523,7 +524,7 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame of bool. The result of the comparison. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ge(self, other, axis: str | int = "columns") -> DataFrame: """Get 'greater than or equal to' of DataFrame and other, element-wise (binary operator `>=`). @@ -549,7 +550,7 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame of bool. The result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def gt(self, other, axis: str | int = "columns") -> DataFrame: """Get 'greater than' of DataFrame and other, element-wise (binary operator `>`). @@ -575,7 +576,7 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame of bool: The result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def add(self, other, axis: str | int = "columns") -> DataFrame: """Get addition of DataFrame and other, element-wise (binary operator `+`). @@ -598,7 +599,7 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sub(self, other, axis: str | int = "columns") -> DataFrame: """Get subtraction of DataFrame and other, element-wise (binary operator `-`). @@ -621,7 +622,7 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rsub(self, other, axis: str | int = "columns") -> DataFrame: """Get subtraction of DataFrame and other, element-wise (binary operator `-`). @@ -644,7 +645,7 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mul(self, other, axis: str | int = "columns") -> DataFrame: """Get multiplication of DataFrame and other, element-wise (binary operator `*`). @@ -667,7 +668,7 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def truediv(self, other, axis: str | int = "columns") -> DataFrame: """Get floating division of DataFrame and other, element-wise (binary operator `/`). @@ -690,7 +691,7 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: """Get floating division of DataFrame and other, element-wise (binary operator `/`). @@ -713,7 +714,7 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame result of the arithmetic operation. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def floordiv(self, other, axis: str | int = "columns") -> DataFrame: """Get integer division of DataFrame and other, element-wise (binary operator `//`). @@ -736,7 +737,7 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: """Get integer division of DataFrame and other, element-wise (binary operator `//`). @@ -759,7 +760,7 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mod(self, other, axis: str | int = "columns") -> DataFrame: """Get modulo of DataFrame and other, element-wise (binary operator `%`). @@ -782,7 +783,7 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rmod(self, other, axis: str | int = "columns") -> DataFrame: """Get modulo of DataFrame and other, element-wise (binary operator `%`). @@ -805,7 +806,7 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Data reshaping @@ -847,7 +848,7 @@ def groupby( Returns: bigframes.core.groupby.SeriesGroupBy: A groupby object that contains information about the groups. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Function application @@ -874,7 +875,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Transformed DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Merging / joining methods @@ -902,7 +903,7 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: Returns: bigframes.dataframe.DataFrame: A dataframe containing columns from both the caller and `other`. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def merge( self, @@ -972,7 +973,7 @@ def merge( Returns: bigframes.dataframe.DataFrame: A DataFrame of the two merged objects. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # ndarray-like stats methods @@ -992,7 +993,7 @@ def any(self, *, bool_only: bool = False): Returns: Series """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def all(self, *, bool_only: bool = False): """ @@ -1009,7 +1010,7 @@ def all(self, *, bool_only: bool = False): Returns: bigframes.series.Series: Series if all elements are True. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def prod(self, *, numeric_only: bool = False): """ @@ -1022,7 +1023,7 @@ def prod(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with the product of the values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def min(self, *, numeric_only: bool = False): """Return the minimum of the values over the requested axis. @@ -1037,7 +1038,7 @@ def min(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with the minimum of the values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def max(self, *, numeric_only: bool = False): """Return the maximum of the values over the requested axis. @@ -1052,7 +1053,7 @@ def max(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series after the maximum of values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sum(self, *, numeric_only: bool = False): """Return the sum of the values over the requested axis. @@ -1066,7 +1067,7 @@ def sum(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with the sum of values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mean(self, *, numeric_only: bool = False): """Return the mean of the values over the requested axis. @@ -1078,7 +1079,7 @@ def mean(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with the mean of values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def median(self, *, numeric_only: bool = False, exact: bool = False): """Return the median of the values over the requested axis. @@ -1093,7 +1094,7 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Returns: bigframes.series.Series: Series with the median of values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def var(self, *, numeric_only: bool = False): """Return unbiased variance over requested axis. @@ -1107,7 +1108,7 @@ def var(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with unbiased variance over requested axis. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std(self, *, numeric_only: bool = False): """Return sample standard deviation over requested axis. @@ -1121,7 +1122,7 @@ def std(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with sample standard deviation. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def count(self, *, numeric_only: bool = False): """ @@ -1138,7 +1139,7 @@ def count(self, *, numeric_only: bool = False): bigframes.series.Series: For each column/row the number of non-NA/null entries. If `level` is specified returns a `DataFrame`. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def nunique(self): """ @@ -1147,7 +1148,7 @@ def nunique(self): Returns: bigframes.series.Series: Series with number of distinct elements. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummin(self) -> DataFrame: """Return cumulative minimum over a DataFrame axis. @@ -1157,7 +1158,7 @@ def cummin(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Return cumulative minimum of DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummax(self) -> DataFrame: """Return cumulative maximum over a DataFrame axis. @@ -1167,7 +1168,7 @@ def cummax(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Return cumulative maximum of DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumsum(self) -> DataFrame: """Return cumulative sum over a DataFrame axis. @@ -1177,7 +1178,7 @@ def cumsum(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Return cumulative sum of DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumprod(self) -> DataFrame: """Return cumulative product over a DataFrame axis. @@ -1187,7 +1188,7 @@ def cumprod(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Return cumulative product of DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def agg(self, func): """ @@ -1202,7 +1203,7 @@ def agg(self, func): Returns: DataFrame or bigframes.series.Series: Aggregated results. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def describe(self): """ @@ -1227,7 +1228,7 @@ def describe(self): Returns: bigframes.dataframe.DataFrame: Summary statistics of the Series or Dataframe provided. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def pivot(self, *, columns, index=None, values=None): """ @@ -1263,7 +1264,30 @@ def pivot(self, *, columns, index=None, values=None): Returns: Returns reshaped DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def stack(self): + """ + Stack the prescribed level(s) from columns to index. + + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to the current + DataFrame. The new inner-most levels are created by pivoting the + columns of the current dataframe: + + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. + + .. 
note:: + BigQuery DataFrames does not support stack operations that would + combine columns of different dtypes. + + Returns: + DataFrame or Series: Stacked dataframe or series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Add index and columns @@ -1280,12 +1304,12 @@ def index(self): Returns: The index labels of the DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def columns(self): "The column labels of the DataFrame." - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def value_counts( self, @@ -1313,4 +1337,4 @@ def value_counts( Returns: Series: Series containing counts of unique rows in the DataFrame """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 4843c971da..56d3b2434f 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -3,6 +3,7 @@ from typing import Literal, Optional +from bigframes import constants from third_party.bigframes_vendored.pandas.core import indexing @@ -22,7 +23,7 @@ def ndim(self) -> int: Returns: int: Return 1 if Series. Otherwise return 2 if DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def size(self) -> int: @@ -32,7 +33,7 @@ def size(self) -> int: int: Return the number of rows if Series. Otherwise return the number of rows times number of columns if DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ------------------------------------------------------------------------- # Unary Methods @@ -46,7 +47,7 @@ def abs(self): Series/DataFrame containing the absolute value of each element. Returns a Series/DataFrame containing the absolute value of each element. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def astype(self, dtype): """ @@ -66,7 +67,7 @@ def astype(self, dtype): same type as caller """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Iteration @@ -85,7 +86,7 @@ def empty(self) -> bool: Returns: bool: If Series/DataFrame is empty, return True, if not return False. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # I/O Methods @@ -155,7 +156,7 @@ def to_json( Returns: None: String output not yet supported. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: """Write object to a comma-separated values (csv) file on Cloud Storage. @@ -177,7 +178,7 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: Returns: None: String output not yet supported. 
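+
+        Examples:
+
+            Illustrative only; ``gs://bucket/path/`` is a placeholder Cloud
+            Storage URI:
+
+            >>> df.to_csv("gs://bucket/path/output_*.csv", index=False)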
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Unsorted @@ -215,7 +216,7 @@ def add_prefix(self, prefix: str, axis: int | str | None = None): Returns: New Series or DataFrame with updated labels. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def add_suffix(self, suffix: str, axis: int | str | None = None): """Suffix labels with string `suffix`. @@ -233,7 +234,7 @@ def add_suffix(self, suffix: str, axis: int | str | None = None): Returns: New Series or DataFrame with updated labels. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def head(self, n: int = 5): """Return the first `n` rows. @@ -254,7 +255,7 @@ def head(self, n: int = 5): Returns: The first `n` rows of the caller object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def tail(self, n: int = 5): """Return the last `n` rows. @@ -275,7 +276,7 @@ def tail(self, n: int = 5): Returns: The last `n` rows of the caller object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sample( self, @@ -301,7 +302,7 @@ def sample( A new object of same type as caller containing `n` items randomly sampled from the caller object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Internal Interface Methods @@ -317,7 +318,7 @@ def dtypes(self): Returns: A *pandas* Series with the data type of each column. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def copy(self): """Make a copy of this object's indices and data. @@ -329,7 +330,7 @@ def copy(self): Returns: Object type matches caller. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Action Methods @@ -346,7 +347,7 @@ def isna(self) -> NDFrame: Mask of bool values for each element that indicates whether an element is an NA value. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) isnull = isna @@ -362,7 +363,7 @@ def notna(self) -> NDFrame: NDFrame: Mask of bool values for each element that indicates whether an element is not an NA value. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) notnull = notna @@ -381,7 +382,7 @@ def shift( Returns: NDFrame: Copy of input object, shifted. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rank( self, @@ -419,7 +420,7 @@ def rank( Returns: same type as caller: Return a Series or DataFrame with data ranks as values. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __nonzero__(self): raise ValueError( diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 95822718c3..9271da8a5e 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -9,6 +9,8 @@ class providing the base-class of operations. """ from __future__ import annotations +from bigframes import constants + class GroupBy: """ @@ -24,7 +26,7 @@ def any(self): where a value is True if any element is True within its respective group, False otherwise. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def all(self): """ @@ -35,7 +37,7 @@ def all(self): where a value is True if all elements are True within its respective group, False otherwise. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def count(self): """ @@ -44,7 +46,7 @@ def count(self): Returns: Series or DataFrame: Count of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mean( self, @@ -60,7 +62,7 @@ def mean( Returns: pandas.Series or pandas.DataFrame: Mean of groups. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def median( self, @@ -81,7 +83,7 @@ def median( Returns: pandas.Series or pandas.DataFrame: Median of groups. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std( self, @@ -100,7 +102,7 @@ def std( Returns: Series or DataFrame: Standard deviation of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def var( self, @@ -120,7 +122,7 @@ def var( Series or DataFrame Variance of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sum( self, @@ -140,7 +142,7 @@ def sum( Returns: Series or DataFrame: Computed sum of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def prod(self, numeric_only: bool = False, min_count: int = 0): """ @@ -156,7 +158,7 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): Returns: Series or DataFrame: Computed prod of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def min( self, @@ -176,7 +178,7 @@ def min( Returns: Series or DataFrame: Computed min of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def max( self, @@ -196,7 +198,7 @@ def max( Returns: Series or DataFrame: Computed max of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumcount(self, ascending: bool = True): """ @@ -209,7 +211,7 @@ def cumcount(self, ascending: bool = True): Returns: Series: Sequence number of each element within each group. 
""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumprod(self, *args, **kwargs): """ @@ -218,7 +220,7 @@ def cumprod(self, *args, **kwargs): Returns: Series or DataFrame: Cumulative product for each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumsum(self, *args, **kwargs): """ @@ -227,7 +229,7 @@ def cumsum(self, *args, **kwargs): Returns: Series or DataFrame: Cumulative sum for each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummin(self, *args, numeric_only: bool = False, **kwargs): """ @@ -236,7 +238,7 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): Returns: Series or DataFrame: Cumulative min for each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummax(self, *args, numeric_only: bool = False, **kwargs): """ @@ -245,7 +247,7 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): Returns: Series or DataFrame: Cumulative max for each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def diff(self): """ @@ -256,7 +258,7 @@ def diff(self): Returns: Series or DataFrame: First differences. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def shift(self, periods: int = 1): """ @@ -269,7 +271,7 @@ def shift(self, periods: int = 1): Returns: Series or DataFrame: Object shifted within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rolling(self, *args, **kwargs): """ @@ -289,7 +291,7 @@ def rolling(self, *args, **kwargs): Returns: Series or DataFrame: Return a new grouper with our rolling appended. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def expanding(self, *args, **kwargs): """ @@ -298,7 +300,7 @@ def expanding(self, *args, **kwargs): Returns: Series or DataFrame: A expanding grouper, providing expanding functionality per group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class SeriesGroupBy(GroupBy): @@ -318,7 +320,7 @@ def agg(self, func): Returns: Series or DataFrame """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class DataFrameGroupBy(GroupBy): @@ -347,4 +349,4 @@ def agg(self, func, **kwargs): Returns: DataFrame """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index d59886e8aa..2b4a326317 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -1,3 +1,6 @@ +from bigframes import constants + + class DatetimeProperties: """ Accessor object for datetime-like properties of the Series values. 
@@ -7,7 +10,7 @@ class DatetimeProperties: def day(self): """The day of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def dayofweek(self): @@ -22,7 +25,7 @@ def dayofweek(self): Series or Index: Containing integers indicating the day number. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def date(self): @@ -36,31 +39,31 @@ def date(self): a numpy array. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def hour(self): """The hours of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def minute(self): """The minutes of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def month(self): """The month as January=1, December=12.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def second(self): """The seconds of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def time(self): @@ -73,7 +76,7 @@ def time(self): a numpy array. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def quarter(self): @@ -84,10 +87,10 @@ def quarter(self): a numpy array. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def year(self): """The year of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index ebad5eb918..404a99809c 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1,5 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/base.py +from bigframes import constants + class Index: """Immutable sequence used for indexing and alignment. @@ -10,14 +12,14 @@ class Index: @property def name(self): """Returns Index name.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def shape(self): """ Return a tuple of the shape of the underlying data. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_numpy(self, dtype): """ @@ -33,4 +35,4 @@ def to_numpy(self, dtype): Returns: numpy.ndarray """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexing.py b/third_party/bigframes_vendored/pandas/core/indexing.py index d5b9f3c079..fae5d6261f 100644 --- a/third_party/bigframes_vendored/pandas/core/indexing.py +++ b/third_party/bigframes_vendored/pandas/core/indexing.py @@ -1,5 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexing.py +from bigframes import constants + class IndexingMixin: """ @@ -32,7 +34,7 @@ def iloc(self): out-of-bounds, except *slice* indexers which allow out-of-bounds indexing (this conforms with python/numpy *slice* semantics). """ - raise NotImplementedError("abstract methdod") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def loc(self): @@ -63,4 +65,4 @@ def loc(self): NotImplementError: if the inputs are not supported. """ - raise NotImplementedError("abstract methdod") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/concat.py b/third_party/bigframes_vendored/pandas/core/reshape/concat.py index 6a5a9fdde9..6e6d2d8b5c 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/concat.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/concat.py @@ -4,6 +4,8 @@ """ from __future__ import annotations +from bigframes import constants + def concat( objs, @@ -135,4 +137,4 @@ def concat( [4 rows x 2 columns] """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 9381ad4552..4f5f2efef0 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -4,6 +4,8 @@ """ from __future__ import annotations +from bigframes import constants + def cut( x, @@ -62,4 +64,4 @@ def cut( are whatever the type in the sequence is. False : returns an ndarray of integers. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 79eb402696..8d505c1ead 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -10,6 +10,7 @@ from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer import pandas.io.formats.format as fmt +from bigframes import constants from third_party.bigframes_vendored.pandas.core.generic import NDFrame if TYPE_CHECKING: @@ -23,31 +24,31 @@ def dt(self): """ Accessor object for datetime-like properties of the Series values. 
""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def index(self): """The index (axis labels) of the Series.""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def shape(self): """Return a tuple of the shape of the underlying data.""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def dtype(self): """ Return the dtype object of the underlying data. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def dtypes(self): """ Return the dtype object of the underlying data. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def name(self) -> Hashable: @@ -62,7 +63,7 @@ def name(self) -> Hashable: hashable object: The name of the Series, also the column name if part of a DataFrame. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def reset_index( self, @@ -94,13 +95,13 @@ def reset_index( In either case, if ``inplace=True``, no value is returned. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __repr__(self) -> str: """ Return a string representation for a particular Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # IO methods (to / from other formats) @@ -165,7 +166,7 @@ def to_string( result = formatter.to_string() # catch contract violations - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_markdown( self, @@ -188,7 +189,7 @@ def to_markdown( Returns: str: {klass} in Markdown-friendly format. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_dict(self, into: type[dict] = dict) -> Mapping: """ @@ -204,7 +205,7 @@ def to_dict(self, into: type[dict] = dict) -> Mapping: Returns: collections.abc.Mapping: Key-value representation of Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_frame(self) -> DataFrame: """ @@ -213,7 +214,7 @@ def to_frame(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: DataFrame representation of Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_excel(self, excel_writer, sheet_name): """ @@ -235,7 +236,7 @@ def to_excel(self, excel_writer, sheet_name): sheet_name (str, default 'Sheet1'): Name of sheet to contain Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): """ @@ -256,7 +257,7 @@ def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): str or None: If buf is None, returns the result as a string. Otherwise returns None. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def tolist(self) -> list: """ @@ -269,7 +270,7 @@ def tolist(self) -> list: Returns: list: list of the values """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) to_list = tolist @@ -296,7 +297,7 @@ def to_numpy(self, dtype, copy=False, na_value=None): numpy.ndarray: A NumPy ndarray representing the values in this Series or Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_pickle(self, path, **kwargs): """ @@ -308,7 +309,7 @@ def to_pickle(self, path, **kwargs): object implementing a binary ``write()`` function. File path where the pickled object will be stored. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_xarray(self): """ @@ -319,7 +320,7 @@ def to_xarray(self): converted to Dataset if the object is a DataFrame, or a DataArray if the object is a Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_json( self, @@ -354,7 +355,7 @@ def to_json( None or str: If path_or_buf is None, returns the resulting json format as a string. Otherwise returns None. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: """ @@ -372,7 +373,7 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: None or str: If path_or_buf is None, returns the resulting csv format as a string. Otherwise returns None. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def agg(self, func): """ @@ -387,7 +388,7 @@ def agg(self, func): Returns: scalar or Series: Aggregated results """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def count(self): """ @@ -397,7 +398,7 @@ def count(self): int or Series (if level specified): Number of non-null values in the Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def nunique(self) -> int: """ @@ -408,7 +409,7 @@ def nunique(self) -> int: Returns: int: number of unique elements in the object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mode(self) -> Series: """ @@ -421,7 +422,7 @@ def mode(self) -> Series: Returns: bigframes.series.Series: Modes of the Series in sorted order. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def drop_duplicates( self, @@ -442,7 +443,7 @@ def drop_duplicates( Returns: bigframes.series.Series: Series with duplicates dropped or None if ``inplace=True``. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def duplicated(self, keep="first") -> Series: """ @@ -466,7 +467,7 @@ def duplicated(self, keep="first") -> Series: bigframes.series.Series: Series indicating whether each value has occurred in the preceding values. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def round(self, decimals: int = 0) -> Series: """ @@ -480,6 +481,29 @@ def round(self, decimals: int = 0) -> Series: Returns: bigframes.series.Series: Rounded values of the Series. """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def corr(self, other, method="pearson", min_periods=None) -> float: + """ + Compute the correlation with the other Series. Non-number values are ignored in the + computation. + + Uses the "Pearson" method of correlation. Numbers are converted to float before + calculation, so the result may be unstable. + + Args: + other (Series): + The series with which this is to be correlated. + method (string, default "pearson"): + Correlation method to use - currently only "pearson" is supported. + min_periods (int, default None): + The minimum number of observations needed to return a result. Non-default values + are not yet supported, so a result will be returned for at least two observations. + + Returns: + float; Will return NaN if there are fewer than two numeric pairs, either series has a + variance or covariance of zero, or any input value is infinite. + """ raise NotImplementedError("abstract method") def diff(self) -> Series: @@ -497,7 +521,7 @@ def diff(self) -> Series: Returns: {klass}: First differences of the Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def dot(self, other) -> Series | np.ndarray: """ @@ -527,19 +551,19 @@ def dot(self, other) -> Series | np.ndarray: """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __matmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __rmatmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sort_values( self, @@ -571,7 +595,7 @@ def sort_values( Returns: bigframes.series.Series: Series ordered by values or None if ``inplace=True``. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sort_index( self, @@ -602,7 +626,7 @@ def sort_index( """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def nlargest( self, n: int = 5, keep: Literal["first", "last", "all"] = "first" @@ -626,7 +650,7 @@ def nlargest( Returns: bigframes.series.Series: The `n` largest values in the Series, sorted in decreasing order. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def nsmallest(self, n: int = 5, keep: str = "first") -> Series: """ @@ -649,7 +673,7 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: Returns: bigframes.series.Series: The `n` smallest values in the Series, sorted in increasing order. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # function application @@ -672,7 +696,7 @@ def apply( bigframes.series.Series: If func returns a Series object the result will be a DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def groupby( self, @@ -726,7 +750,7 @@ def groupby( bigframes.core.groupby.SeriesGroupBy: Returns a groupby object that contains information about the groups. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def drop( self, labels=None, *, axis=0, index=None, columns=None, level=None @@ -758,7 +782,7 @@ def drop( Raises: KeyError: If none of the labels are found in the index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def reorder_levels(self, order: Sequence) -> Series: """ @@ -773,7 +797,7 @@ def reorder_levels(self, order: Sequence) -> Series: Returns: type of caller (new object) """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def droplevel(self, level): """ @@ -788,7 +812,7 @@ def droplevel(self, level): Returns: Series with requested index / column level(s) removed. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def fillna( self, @@ -804,7 +828,7 @@ def fillna( Returns: Series or None: Object with missing values filled or None. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def between( self, @@ -832,7 +856,7 @@ def between( right (inclusive). """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumprod(self): """ @@ -844,7 +868,7 @@ def cumprod(self): Returns: bigframes.series.Series: Return cumulative sum of scalar or Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumsum(self): """ @@ -861,7 +885,7 @@ def cumsum(self): Returns: scalar or Series: Return cumulative sum of scalar or Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummax(self): """ @@ -878,7 +902,7 @@ def cummax(self): Returns: bigframes.series.Series: Return cumulative maximum of scalar or Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummin(self): """ @@ -901,7 +925,7 @@ def cummin(self): Returns: bigframes.series.Series: Return cumulative minimum of scalar or Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def eq(self, other) -> Series: """Return equal of Series and other, element-wise (binary operator eq). @@ -916,7 +940,7 @@ def eq(self, other) -> Series: Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ne(self, other) -> Series: """Return not equal of Series and other, element-wise (binary operator ne). 
@@ -931,7 +955,7 @@ def ne(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def le(self, other) -> Series: """Get 'less than or equal to' of Series and other, element-wise (binary operator `<=`). @@ -946,7 +970,7 @@ def le(self, other) -> Series: bigframes.series.Series. The result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def lt(self, other) -> Series: """Get 'less than' of Series and other, element-wise (binary operator `<`). @@ -961,7 +985,7 @@ def lt(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ge(self, other) -> Series: """Get 'greater than or equal to' of Series and other, element-wise (binary operator `>=`). @@ -976,7 +1000,7 @@ def ge(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def gt(self, other) -> Series: """Get 'less than or equal to' of Series and other, element-wise (binary operator `<=`). @@ -991,7 +1015,7 @@ def gt(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def add(self, other) -> Series: """Return addition of Series and other, element-wise (binary operator add). @@ -1006,7 +1030,7 @@ def add(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def radd(self, other) -> Series: """Return addition of Series and other, element-wise (binary operator radd). @@ -1021,7 +1045,7 @@ def radd(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sub( self, @@ -1039,7 +1063,7 @@ def sub( bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rsub(self, other) -> Series: """Return subtraction of Series and other, element-wise (binary operator rsub). @@ -1054,7 +1078,7 @@ def rsub(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mul(self, other) -> Series: """Return multiplication of Series and other, element-wise (binary operator mul). @@ -1069,7 +1093,7 @@ def mul(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rmul(self, other) -> Series: """Return multiplication of Series and other, element-wise (binary operator mul). @@ -1083,7 +1107,7 @@ def rmul(self, other) -> Series: Returns: Series: The result of the operation. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def truediv(self, other) -> Series: """Return floating division of Series and other, element-wise (binary operator truediv). @@ -1098,7 +1122,7 @@ def truediv(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rtruediv(self, other) -> Series: """Return floating division of Series and other, element-wise (binary operator rtruediv). @@ -1113,7 +1137,7 @@ def rtruediv(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def floordiv(self, other) -> Series: """Return integer division of Series and other, element-wise (binary operator floordiv). @@ -1128,7 +1152,7 @@ def floordiv(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rfloordiv(self, other) -> Series: """Return integer division of Series and other, element-wise (binary operator rfloordiv). @@ -1143,7 +1167,7 @@ def rfloordiv(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mod(self, other) -> Series: """Return modulo of Series and other, element-wise (binary operator mod). @@ -1158,7 +1182,7 @@ def mod(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rmod(self, other) -> Series: """Get modulo of Series and other, element-wise (binary operator `rmod`). @@ -1173,7 +1197,7 @@ def rmod(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def divmod(self, other) -> Series: """Return integer division and modulo of Series and other, element-wise (binary operator divmod). @@ -1188,7 +1212,7 @@ def divmod(self, other) -> Series: consistent with (floordiv, mod) (though pandas may not). """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rdivmod(self, other) -> Series: """Return integer division and modulo of Series and other, element-wise (binary operator rdivmod). @@ -1203,7 +1227,7 @@ def rdivmod(self, other) -> Series: consistent with (rfloordiv, rmod) (though pandas may not). """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def all( self, @@ -1218,7 +1242,7 @@ def all( scalar or Series: If level is specified, then, Series is returned; otherwise, scalar is returned. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def any( self, @@ -1233,7 +1257,7 @@ def any( scalar or Series: If level is specified, then, Series is returned; otherwise, scalar is returned. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def max( self, @@ -1248,7 +1272,7 @@ def max( Returns: scalar or scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def min( self, @@ -1262,7 +1286,7 @@ def min( Returns: scalar or scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std( self, @@ -1277,7 +1301,7 @@ def std( ------- scalar or Series (if level specified) """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def var( self, @@ -1290,7 +1314,7 @@ def var( Returns: scalar or Series (if level specified) """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sum(self): """Return the sum of the values over the requested axis. @@ -1300,7 +1324,7 @@ def sum(self): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mean(self): """Return the mean of the values over the requested axis. @@ -1308,7 +1332,7 @@ def mean(self): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def median(self, *, exact: bool = False): """Return the median of the values over the requested axis. @@ -1321,7 +1345,7 @@ def median(self, *, exact: bool = False): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def prod(self): """Return the product of the values over the requested axis. @@ -1329,7 +1353,7 @@ def prod(self): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def skew(self): """Return unbiased skew over requested axis. @@ -1339,7 +1363,7 @@ def skew(self): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def kurt(self): """Return unbiased kurtosis over requested axis. @@ -1349,7 +1373,7 @@ def kurt(self): Returns: scalar or scalar: Unbiased kurtosis over requested axis. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def where(self, cond, other): """Replace values where the condition is False. @@ -1373,7 +1397,7 @@ def where(self, cond, other): Returns: bigframes.series.Series: Series after the replacement. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mask(self, cond, other): """Replace values where the condition is True. @@ -1397,7 +1421,7 @@ def mask(self, cond, other): Returns: bigframes.series.Series: Series after the replacement. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def clip(self): """Trim values at input threshold(s). @@ -1416,7 +1440,7 @@ def clip(self): Returns: Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def argmax(self): """ @@ -1427,7 +1451,7 @@ def argmax(self): Returns: Series: Row position of the maximum value. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def argmin(self): """ @@ -1438,7 +1462,7 @@ def argmin(self): Returns: Series: Row position of the minimum value. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rename(self, index, **kwargs) -> Series | None: """ @@ -1461,7 +1485,7 @@ def rename(self, index, **kwargs) -> Series | None: bigframes.series.Series: Series with index labels. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rename_axis(self, mapper, **kwargs): """ @@ -1474,7 +1498,7 @@ def rename_axis(self, mapper, **kwargs): Returns: bigframes.series.Series: Series with the name of the axis set. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rolling( self, @@ -1515,7 +1539,7 @@ def rolling( bigframes.core.window.Window: ``Window`` subclass if a ``win_type`` is passed. ``Rolling`` subclass if ``win_type`` is not passed. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def expanding(self, min_periods=1): """ @@ -1529,7 +1553,7 @@ def expanding(self, min_periods=1): Returns: bigframes.core.window.Window: ``Expanding`` subclass. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def value_counts( self, @@ -1560,7 +1584,7 @@ def value_counts( Returns: Series: Series containing counts of unique values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def str(self): @@ -1570,7 +1594,7 @@ def str(self): NAs stay NA unless handled otherwise by a particular method. Patterned after Python’s string methods, with some inspiration from R’s stringr package. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isin(self, values): """ @@ -1595,7 +1619,7 @@ def isin(self, values): Raises: TypeError: If input is not list-like. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def is_monotonic_increasing(self) -> bool: @@ -1605,7 +1629,7 @@ def is_monotonic_increasing(self) -> bool: Returns: bool """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def is_monotonic_decreasing(self) -> bool: @@ -1615,4 +1639,4 @@ def is_monotonic_decreasing(self) -> bool: Returns: bool """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 5e3d0b047f..ecdd9547d5 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -1,6 +1,8 @@ import re import typing +from bigframes import constants + class StringMethods: """ @@ -32,7 +34,7 @@ def extract(self, pat: str, flags: int = 0): expression pat will be used for column names; otherwise capture group numbers will be used. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def find(self, sub, start: int = 0, end=None): """Return lowest indexes in each strings in the Series/Index. @@ -52,7 +54,7 @@ def find(self, sub, start: int = 0, end=None): Returns: bigframes.series.Series: Series with lowest indexes in each strings. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def len(self): """Compute the length of each element in the Series/Index. @@ -65,7 +67,7 @@ def len(self): the length of each element in the Series or Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def lower(self): """Convert strings in the Series/Index to lowercase. @@ -76,7 +78,7 @@ def lower(self): bigframes.series.Series: Series with lowercase. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def slice(self, start=None, stop=None): """Slice substrings from each element in the Series or Index. @@ -94,7 +96,7 @@ def slice(self, start=None, stop=None): substring from original string object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def strip(self): """Remove leading and trailing characters. @@ -109,7 +111,7 @@ def strip(self): and trailing characters. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def upper(self): """Convert strings in the Series/Index to uppercase. @@ -120,7 +122,7 @@ def upper(self): bigframes.series.Series: Series with uppercase strings. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isnumeric(self): """Check whether all characters in each string are numeric. @@ -134,7 +136,7 @@ def isnumeric(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isalpha(self): """Check whether all characters in each string are alphabetic. @@ -147,7 +149,7 @@ def isalpha(self): bigframes.series.Series: Series with the same length as the originalSeries/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isdigit(self): """Check whether all characters in each string are digits. @@ -160,7 +162,7 @@ def isdigit(self): bigframes.series.Series: Series with the same length as the originalSeries/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isalnum(self): """Check whether all characters in each string are alphanumeric. @@ -174,7 +176,7 @@ def isalnum(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isspace(self): """Check whether all characters in each string are whitespace. @@ -188,7 +190,7 @@ def isspace(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def islower(self): """Check whether all characters in each string are lowercase. @@ -202,7 +204,7 @@ def islower(self): same length as the original Series/Index. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isupper(self): """Check whether all characters in each string are uppercase. @@ -216,7 +218,7 @@ def isupper(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isdecimal(self): """Check whether all characters in each string are decimal. @@ -230,7 +232,7 @@ def isdecimal(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rstrip(self): """Remove trailing characters. @@ -244,7 +246,7 @@ def rstrip(self): bigframes.series.Series: Series without trailing characters. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def lstrip(self): """Remove leading characters. @@ -258,7 +260,7 @@ def lstrip(self): bigframes.series.Series: Series without leading characters. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def repeat(self, repeats: int): """Duplicate each string in the Series or Index. @@ -272,7 +274,7 @@ def repeat(self, repeats: int): objects specified by input parameter repeats. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def capitalize(self): """Convert strings in the Series/Index to be capitalized. @@ -283,7 +285,7 @@ def capitalize(self): bigframes.series.Series: Series with captitalized strings. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cat(self, others, *, join): """Concatenate strings in the Series/Index with given separator. @@ -304,7 +306,7 @@ def cat(self, others, *, join): bigframes.series.Series: Series with concatenated strings. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True): """ @@ -329,7 +331,7 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True whether the given pattern is contained within the string of each element of the Series or Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def replace( self, @@ -373,7 +375,7 @@ def replace( of `pat` replaced by `repl`. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def startswith( self, @@ -391,7 +393,7 @@ def startswith( bigframes.series.Series: A Series of booleans indicating whether the given pattern matches the start of each string element. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def endswith( self, @@ -409,7 +411,7 @@ def endswith( bigframes.series.Series: A Series of booleans indicating whether the given pattern matches the end of each string element. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def match(self, pat: str, case: bool = True, flags: int = 0): """ @@ -426,7 +428,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0): Returns: bigframes.series.Series: Series of boolean values """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def fullmatch(self, pat: str, case: bool = True, flags: int = 0): """ @@ -443,7 +445,7 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0): Returns: bigframes.series.Series: Series of boolean values """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def get(self, i: int): """ @@ -459,7 +461,7 @@ def get(self, i: int): Returns: bigframes.series.Series: Series """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def pad( self, @@ -482,7 +484,7 @@ def pad( Returns: bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ljust( self, @@ -502,7 +504,7 @@ def ljust( Returns: bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rjust( self, @@ -522,7 +524,7 @@ def rjust( Returns: bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def zfill( self, @@ -544,7 +546,7 @@ def zfill( Returns: bigframes.series.Series: Series of objects. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def center( self, @@ -566,4 +568,4 @@ def center( Returns: bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/window/rolling.py b/third_party/bigframes_vendored/pandas/core/window/rolling.py index 7a9239b70c..a869c86e72 100644 --- a/third_party/bigframes_vendored/pandas/core/window/rolling.py +++ b/third_party/bigframes_vendored/pandas/core/window/rolling.py @@ -4,34 +4,36 @@ similar to how we have a Groupby object. 
""" +from bigframes import constants + class Window: """Provide window calculations.""" def count(self): """Calculate the window count of non-NULL observations.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sum(self): """Calculate the weighted window sum.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mean(self): """Calculate the weighted window mean.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def var(self): """Calculate the weighted window variance.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std(self): """Calculate the weighted window standard deviation.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def max(self): """Calculate the weighted window maximum.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def min(self): """Calculate the weighted window minimum.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 730872034d..95531ff5e8 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -5,6 +5,8 @@ from typing import Iterable, Optional +from bigframes import constants + class GBQIOMixin: def read_gbq( @@ -83,4 +85,4 @@ def read_gbq( Returns: bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 6f0a2b3cb4..9aed9af5a8 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -2,6 +2,8 @@ """ parquet compat """ from __future__ import annotations +from bigframes import constants + class ParquetIOMixin: def read_parquet( @@ -22,4 +24,4 @@ def read_parquet( Returns: bigframes.dataframe.DataFrame: A BigQuery DataFrames. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index e01eb734fb..d19a92ecdf 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -6,10 +6,22 @@ """ from __future__ import annotations -from typing import Any, Dict, Literal, MutableSequence, Optional, Sequence, Tuple, Union +from typing import ( + Any, + Dict, + IO, + Literal, + MutableSequence, + Optional, + Sequence, + Tuple, + Union, +) import numpy as np +from bigframes import constants + class ReaderIOMixin: def read_csv( @@ -45,7 +57,8 @@ def read_csv( Args: filepath_or_buffer (str): - a string path including Cloud Storage and local file. + A local or Google Cloud Storage (`gs://`) path with `engine="bigquery"` + otherwise passed to pandas.read_csv. 
sep (Optional[str], default ","):
the separator for fields in a CSV file. For the BigQuery engine, the separator
can be any ISO-8859-1 single-byte character. To use a character in the range
@@ -104,10 +117,71 @@
https://docs.python.org/3/library/codecs.html#standard-encodings
The BigQuery engine only supports `UTF-8` and `ISO-8859-1`.
**kwargs:
- keyword arguments.
+ keyword arguments for `pandas.read_csv` when not using the BigQuery engine.

Returns:
bigframes.dataframe.DataFrame: A BigQuery DataFrames.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def read_json(
+ self,
+ path_or_buf: str | IO["bytes"],
+ *,
+ orient: Literal[
+ "split", "records", "index", "columns", "values", "table"
+ ] = "columns",
+ dtype: Optional[Dict] = None,
+ encoding: Optional[str] = None,
+ lines: bool = False,
+ engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson",
+ **kwargs,
+ ):
+ """
+ Convert a JSON string to DataFrame object.
+
+ .. note::
+ Using `engine="bigquery"` does not guarantee the same ordering as the
+ file. Instead, set a serialized index column as the index and sort by
+ that in the resulting DataFrame.
+
+ Args:
+ path_or_buf (a valid JSON str, path object or file-like object):
+ A local or Google Cloud Storage (`gs://`) path when using `engine="bigquery"`;
+ otherwise passed to `pandas.read_json`.
+ orient (str, optional):
+ If `engine="bigquery"`, only `orient="records"` is supported.
+ Indication of expected JSON string format.
+ Compatible JSON strings can be produced by ``to_json()`` with a
+ corresponding orient value.
+ The set of possible orients is:
+
+ - ``'split'`` : dict like
+ ``{{index -> [index], columns -> [columns], data -> [values]}}``
+ - ``'records'`` : list like
+ ``[{{column -> value}}, ... , {{column -> value}}]``
+ - ``'index'`` : dict like ``{{index -> {{column -> value}}}}``
+ - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}``
+ - ``'values'`` : just the values array
+
+ dtype (bool or dict, default None):
+ If True, infer dtypes; if a dict of column to dtype, then use those;
+ if False, don't infer dtypes at all; applies only to the data.
+
+ For all ``orient`` values except ``'table'``, default is True.
+ encoding (str, default is 'utf-8'):
+ The encoding to use to decode py3 bytes.
+ lines (bool, default False):
+ Read the file as a json object per line. If using `engine="bigquery"`, only `lines=True` is supported.
+ engine ({{"ujson", "pyarrow", "bigquery"}}, default "ujson"):
+ Type of engine to use. If `engine="bigquery"` is specified, then BigQuery's load API will be used.
+ Otherwise, the engine will be passed to `pandas.read_json`.
+ **kwargs:
+ keyword arguments for `pandas.read_json` when not using the BigQuery engine.
+
+ Returns:
+ bigframes.dataframe.DataFrame:
+ The DataFrame representing JSON contents.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py
index a160ef0c4e..71b31956a0 100644
--- a/third_party/bigframes_vendored/pandas/io/pickle.py
+++ b/third_party/bigframes_vendored/pandas/io/pickle.py
@@ -9,6 +9,8 @@
StorageOptions,
)

+from bigframes import constants
+

class PickleIOMixin:
def read_pickle(
@@ -52,4 +54,4 @@ def read_pickle(
bigframes.dataframe.DataFrame or bigframes.series.Series:
same type as object stored in file.
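
Example (an illustrative sketch; the bucket and object names are hypothetical):

    import bigframes.pandas as bpd

    df = bpd.read_pickle("gs://my-bucket/my-data.pkl")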
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index fc48cde85b..847ad06f75 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -9,6 +9,8 @@ import inspect from typing import Any, Dict, List +from bigframes import constants + class BaseEstimator: """Base class for all estimators. @@ -94,7 +96,7 @@ def score(self, X, y): Returns: bigframes.dataframe.DataFrame: A DataFrame of the evaluation result. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class RegressorMixin: @@ -120,7 +122,7 @@ def score(self, X, y): Returns: bigframes.dataframe.DataFrame: A DataFrame of the evaluation result. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class TransformerMixin: diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index ea4df0dc02..ff1c04edbe 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -14,6 +14,7 @@ from abc import ABC from typing import List, Optional +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -32,7 +33,7 @@ def predict(self, X): bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing the class labels for each sample. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class KMeans(_BaseKMeans): @@ -65,7 +66,7 @@ def fit( Returns: KMeans: Fitted Estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def predict( self, @@ -80,7 +81,7 @@ def predict( Returns: bigframes.dataframe.DataFrame: DataFrame of the cluster each sample belongs to. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score( self, @@ -98,7 +99,7 @@ def score( Returns: bigframes.dataframe.DataFrame: DataFrame of the metrics. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def cluster_centers_(self): @@ -119,4 +120,4 @@ def cluster_centers_(self): The output contains one row per feature per centroid. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py index bc8bc3980a..dead173b2d 100644 --- a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py +++ b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py @@ -6,6 +6,7 @@ from abc import ABCMeta +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -43,7 +44,7 @@ def fit( Returns: ColumnTransformer: Fitted estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def transform( self, @@ -58,4 +59,4 @@ def transform( Returns: bigframes.dataframe.DataFrame: Transformed result. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 619c13f35d..85feab0024 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -12,6 +12,7 @@ from abc import ABCMeta +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -55,7 +56,7 @@ def fit(self, X, y=None): Returns: PCA: Fitted estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score(self, X=None, y=None): """Return the metrics of the model. @@ -69,7 +70,7 @@ def score(self, X=None, y=None): Returns: bigframes.dataframe.DataFrame: DataFrame that represents model metrics. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def predict(self, X): """Predict the closest cluster for each sample in X. @@ -80,4 +81,51 @@ def predict(self, X): Returns: bigframes.dataframe.DataFrame: predicted DataFrames.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def components_(self): + """Principal axes in feature space, representing the directions of maximum variance in the data. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of principal components, containing following columns: + principal_component_id: An integer that identifies the principal component. + + feature: The column name that contains the feature. + + numerical_value: If feature is numeric, the value of feature for the principal component that principal_component_id identifies. If feature isn't numeric, the value is NULL. + + categorical_value: An list of mappings containing information about categorical features. Each mapping contains the following fields: + categorical_value.category: The name of each category. + + categorical_value.value: The value of categorical_value.category for the centroid that centroid_id identifies. + + The output contains one row per feature per component. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def explained_variance_(self): + """The amount of variance explained by each of the selected components. + + Returns: + bigframes.dataframe.DataFrame: DataFrame containing following columns: + principal_component_id: An integer that identifies the principal component. + + explained_variance: The factor by which the eigenvector is scaled. Eigenvalue and explained variance are the same concepts in PCA. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def explained_variance_ratio_(self): + """Percentage of variance explained by each of the selected components. + + Returns: + bigframes.dataframe.DataFrame: DataFrame containing following columns: + principal_component_id: An integer that identifies the principal component. + + explained_variance_ratio: the total variance is the sum of variances, also known as eigenvalues, of all + of the individual principal components. The explained variance ratio by a principal component is + the ratio between the variance, also known as eigenvalue, of that principal component and the total variance. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 73f4684dc3..79224a772d 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -33,6 +33,8 @@ class calls the ``fit`` method of each sub-estimator on random samples from abc import ABCMeta +from bigframes import constants + from ..base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, RegressorMixin @@ -60,7 +62,7 @@ def fit(self, X, y): Returns: Fitted Estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): @@ -82,7 +84,7 @@ def predict(self, X): Returns: The predicted values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class RandomForestRegressor(ForestRegressor): @@ -148,7 +150,7 @@ def predict(self, X): Returns: The predicted values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class RandomForestClassifier(ForestClassifier): diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 65e895298d..8141da4e3b 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -18,6 +18,7 @@ from abc import ABCMeta from typing import List, Optional +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import ( BaseEstimator, ClassifierMixin, @@ -36,7 +37,7 @@ def predict(self, X): Returns: bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,). Returns predicted values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class LinearClassifierMixin(ClassifierMixin): @@ -52,7 +53,7 @@ def predict(self, X): bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing the class labels for each sample. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class LinearRegression(RegressorMixin, LinearModel): @@ -92,4 +93,4 @@ def fit( Returns: LinearRegression: Fitted Estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index 8525e57068..a06035eef6 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -14,6 +14,7 @@ from typing import List, Optional +from bigframes import constants from third_party.bigframes_vendored.sklearn.linear_model._base import ( BaseEstimator, LinearClassifierMixin, @@ -57,4 +58,4 @@ def fit( Returns: LogisticRegression: Fitted Estimator. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index 6d9692ac8d..a9d8038e59 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -20,6 +20,8 @@ # Michal Karbownik # License: BSD 3 clause +from bigframes import constants + def accuracy_score(y_true, y_pred, normalize=True) -> float: """Accuracy classification score. @@ -39,7 +41,7 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float: classified samples (float), else returns the number of correctly classified samples (int). """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def confusion_matrix( @@ -68,7 +70,7 @@ def confusion_matrix( samples with true label being i-th class and predicted label being j-th class. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def recall_score( @@ -99,7 +101,7 @@ def recall_score( of the positive class in binary classification or weighted average of the recall of each class for the multiclass task. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def precision_score( @@ -132,7 +134,7 @@ def precision_score( Precision of the positive class in binary classification or weighted average of the precision of each class for the multiclass task. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def f1_score( @@ -167,4 +169,4 @@ def f1_score( average of the F1 scores of each class for the multiclass task. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 693996070f..ac919edbe3 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -16,6 +16,8 @@ # Michal Karbownik # License: BSD 3 clause +from bigframes import constants + def auc(x, y) -> float: """Compute Area Under the Curve (AUC) using the trapezoidal rule. @@ -35,7 +37,7 @@ def auc(x, y) -> float: Returns: float: Area Under the Curve. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def roc_auc_score(y_true, y_score) -> float: @@ -60,7 +62,7 @@ def roc_auc_score(y_true, y_score) -> float: Returns: float: Area Under the Curve score. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def roc_curve( @@ -95,4 +97,4 @@ def roc_curve( fpr and tpr. `thresholds[0]` represents no instances being predicted and is arbitrarily set to `max(y_score) + 1`. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index b90c415887..9740c540e9 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -24,6 +24,8 @@ # Ohad Michel # License: BSD 3 clause +from bigframes import constants + def r2_score(y_true, y_pred, force_finite=True) -> float: """:math:`R^2` (coefficient of determination) regression score function. @@ -49,4 +51,4 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: Returns: float: The :math:`R^2` score. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/pipeline.py b/third_party/bigframes_vendored/sklearn/pipeline.py index f8bbae86df..4b8eb25a97 100644 --- a/third_party/bigframes_vendored/sklearn/pipeline.py +++ b/third_party/bigframes_vendored/sklearn/pipeline.py @@ -11,6 +11,7 @@ from abc import ABCMeta +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -47,7 +48,7 @@ def fit( Returns: Pipeline: Pipeline with fitted steps. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score(self, X, y): @@ -67,7 +68,7 @@ def score(self, X, y): DataFrame: A DataFrame representing the result of calling `score` on the final estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def predict(self, X): @@ -81,4 +82,4 @@ def predict(self, X): bigframes.dataframe.DataFrame: A Dataframe representing predicted result. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py index c57d1f2230..d013043467 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py @@ -7,6 +7,7 @@ # Eric Chang # License: BSD 3 clause +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -59,7 +60,7 @@ def fit(self, X): Returns: StandardScaler: Fitted scaler. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def transform(self, X): """Perform standardization by centering and scaling. @@ -71,4 +72,4 @@ def transform(self, X): Returns: bigframes.dataframe.DataFrame: Transformed result. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index a6c32d91c1..b1cf17e539 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -2,6 +2,7 @@ # Joris Van den Bossche # License: BSD 3 clause +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -61,7 +62,7 @@ def fit(self, X): Returns: OneHotEncoder: Fitted encoder. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def transform(self, X): """Transform X using one-hot encoding. @@ -73,4 +74,4 @@ def transform(self, X): Returns: bigframes.dataframe.DataFrame: The result is categorized as index: number, value: number. Where index is the position of the dict that seeing the category, and value is 0 or 1.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index fcb5d2ec59..620c87fa3d 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -2,6 +2,8 @@ from typing import Any +from bigframes import constants + from ..sklearn.base import BaseEstimator as XGBModelBase from ..sklearn.base import ClassifierMixin as XGBClassifierBase from ..sklearn.base import RegressorMixin as XGBRegressorBase @@ -18,7 +20,7 @@ def predict(self, X): Returns: DataFrame of shape (n_samples,): Returns predicted values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def fit(self, X, y): """Fit gradient boosting model. @@ -42,7 +44,7 @@ def fit(self, X, y): Returns: XGBModel: Fitted Estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class XGBClassifierMixIn: