From 0ffa9f065b09c3c26b0a1e74d7dc0460c6e1c8ac Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 2 Jan 2024 21:11:11 +0000 Subject: [PATCH] feat: add 'columns' as an alias for 'col_order' --- bigframes/pandas/__init__.py | 18 +++-- bigframes/session/__init__.py | 66 +++++++++++++------ ...q_dataframes_ml_drug_name_generation.ipynb | 4 +- tests/system/small/test_session.py | 10 +-- .../bigframes_vendored/pandas/io/gbq.py | 11 ++-- 5 files changed, 71 insertions(+), 38 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 7386c4a2e7..76b30e6680 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -486,20 +486,22 @@ def read_gbq( query_or_table: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, filters: vendored_pandas_gbq.FiltersType = (), use_cache: bool = True, + col_order: Iterable[str] = (), ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( bigframes.session.Session.read_gbq, query_or_table, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, filters=filters, use_cache=use_cache, + col_order=col_order, ) @@ -520,18 +522,20 @@ def read_gbq_query( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, use_cache: bool = True, + col_order: Iterable[str] = (), ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) return global_session.with_default_session( bigframes.session.Session.read_gbq_query, query, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, use_cache=use_cache, + col_order=col_order, ) @@ -542,18 +546,20 @@ def read_gbq_table( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + 
columns: Iterable[str] = (), max_results: Optional[int] = None, use_cache: bool = True, + col_order: Iterable[str] = (), ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) return global_session.with_default_session( bigframes.session.Session.read_gbq_table, query, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, use_cache=use_cache, + col_order=col_order, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index fbe900106a..15c262afa7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -232,20 +232,28 @@ def read_gbq( query_or_table: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, filters: third_party_pandas_gbq.FiltersType = (), use_cache: bool = True, + col_order: Iterable[str] = (), # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. 
- query_or_table = self._filters_to_query(query_or_table, col_order, filters) + if columns and col_order: + raise ValueError( + "Must specify either columns (preferred) or col_order, not both" + ) + elif col_order: + columns = col_order + + query_or_table = self._filters_to_query(query_or_table, columns, filters) if _is_query(query_or_table): return self._read_gbq_query( query_or_table, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, api_name="read_gbq", use_cache=use_cache, @@ -257,7 +265,7 @@ def read_gbq( return self._read_gbq_table( query_or_table, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, api_name="read_gbq", use_cache=use_cache, @@ -388,9 +396,10 @@ def read_gbq_query( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, use_cache: bool = True, + col_order: Iterable[str] = (), ) -> dataframe.DataFrame: """Turn a SQL query into a DataFrame. @@ -442,10 +451,17 @@ def read_gbq_query( """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so # these docstrings are inline. 
+ if columns and col_order: + raise ValueError( + "Must specify either columns (preferred) or col_order, not both" + ) + elif col_order: + columns = col_order + return self._read_gbq_query( query=query, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, api_name="read_gbq_query", use_cache=use_cache, @@ -456,7 +472,7 @@ def _read_gbq_query( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, api_name: str = "read_gbq_query", use_cache: bool = True, @@ -492,7 +508,7 @@ def _read_gbq_query( return self.read_gbq_table( f"{destination.project}.{destination.dataset_id}.{destination.table_id}", index_col=index_cols, - col_order=col_order, + columns=columns, max_results=max_results, use_cache=use_cache, ) @@ -502,9 +518,10 @@ def read_gbq_table( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, use_cache: bool = True, + col_order: Iterable[str] = (), ) -> dataframe.DataFrame: """Turn a BigQuery table into a DataFrame. @@ -521,10 +538,17 @@ def read_gbq_table( """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so # these docstrings are inline. 
+ if columns and col_order: + raise ValueError( + "Must specify either columns (preferred) or col_order, not both" + ) + elif col_order: + columns = col_order + return self._read_gbq_table( query=query, index_col=index_col, - col_order=col_order, + columns=columns, max_results=max_results, api_name="read_gbq_table", use_cache=use_cache, @@ -583,7 +607,7 @@ def _read_gbq_table( query: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, api_name: str, use_cache: bool = True, @@ -602,10 +626,10 @@ def _read_gbq_table( table_ref, api_name=api_name, use_cache=use_cache ) - for key in col_order: + for key in columns: if key not in table_expression.columns: raise ValueError( - f"Column '{key}' of `col_order` not found in this table." + f"Column '{key}' of `columns` not found in this table." ) if isinstance(index_col, str): @@ -619,8 +643,8 @@ def _read_gbq_table( f"Column `{key}` of `index_col` not found in this table." ) - if col_order: - table_expression = table_expression.select([*index_cols, *col_order]) + if columns: + table_expression = table_expression.select([*index_cols, *columns]) # If the index is unique and sortable, then we don't need to generate # an ordering column. @@ -719,7 +743,7 @@ def _read_bigquery_load_job( *, job_config: bigquery.LoadJobConfig, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), ) -> dataframe.DataFrame: if isinstance(index_col, str): index_cols = [index_col] @@ -760,7 +784,7 @@ def _read_bigquery_load_job( return self.read_gbq_table( table_id, index_col=index_col, - col_order=col_order, + columns=columns, ) def read_gbq_model(self, model_name: str): @@ -959,13 +983,13 @@ def read_csv( if index_col is None: index_col = () - # usecols should only be an iterable of strings (column names) for use as col_order in read_gbq. - col_order: Tuple[Any, ...] 
= tuple() + # usecols should only be an iterable of strings (column names) for use as columns in read_gbq. + columns: Tuple[Any, ...] = tuple() if usecols is not None: if isinstance(usecols, Iterable) and all( isinstance(col, str) for col in usecols ): - col_order = tuple(col for col in usecols) + columns = tuple(col for col in usecols) else: raise NotImplementedError( "BigQuery engine only supports an iterable of strings for `usecols`. " @@ -1000,7 +1024,7 @@ def read_csv( table, job_config=job_config, index_col=index_col, - col_order=col_order, + columns=columns, ) else: if any(arg in kwargs for arg in ("chunksize", "iterator")): diff --git a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb index 56d7bd1355..52a1c4e768 100644 --- a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb @@ -613,7 +613,7 @@ "source": [ "# Query 3 columns of interest from drug label dataset\n", "df = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n", - " col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", + " columns=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", "\n", "# Exclude any rows with missing data\n", "df = df.dropna()\n", @@ -825,7 +825,7 @@ "source": [ "# Query 3 columns of interest from drug label dataset\n", "df_missing = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n", - " col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", + " columns=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", "\n", "# Exclude any rows with missing data\n", "df_missing = df_missing.dropna()\n", diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 8ce442376a..2d9c332de1 100644 --- 
a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -52,7 +52,7 @@ def test_read_gbq_tokyo( @pytest.mark.parametrize( - ("query_or_table", "col_order"), + ("query_or_table", "columns"), [ pytest.param( "{scalars_table_id}", ["bool_col", "int64_col"], id="two_cols_in_table" @@ -79,16 +79,16 @@ def test_read_gbq_tokyo( ), ], ) -def test_read_gbq_w_col_order( +def test_read_gbq_w_columns( session: bigframes.Session, scalars_table_id: str, query_or_table: str, - col_order: List[str], + columns: List[str], ): df = session.read_gbq( - query_or_table.format(scalars_table_id=scalars_table_id), col_order=col_order + query_or_table.format(scalars_table_id=scalars_table_id), columns=columns ) - assert df.columns.tolist() == col_order + assert df.columns.tolist() == columns @pytest.mark.parametrize( diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index dc8bcc1f77..8e2c9f092d 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -17,10 +17,11 @@ def read_gbq( query_or_table: str, *, index_col: Iterable[str] | str = (), - col_order: Iterable[str] = (), + columns: Iterable[str] = (), max_results: Optional[int] = None, filters: FiltersType = (), use_cache: bool = True, + col_order: Iterable[str] = (), ): """Loads a DataFrame from BigQuery. @@ -77,11 +78,11 @@ def read_gbq( Reading data with `columns` and `filters` parameters: - >>> col_order = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] + >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant'])] >>> df = bpd.read_gbq( ... "bigquery-public-data.baseball.games_wide", - ... col_order=col_order, + ... columns=columns, ... filters=filters, ... 
) >>> df.head(1) @@ -97,7 +98,7 @@ def read_gbq( `project.dataset.tablename` or `dataset.tablename`. index_col (Iterable[str] or str): Name of result column(s) to use for index in results DataFrame. - col_order (Iterable[str]): + columns (Iterable[str]): List of BigQuery column names in the desired order for results DataFrame. max_results (Optional[int], default None): @@ -113,6 +114,8 @@ def read_gbq( is to be conducted. use_cache (bool, default True): Whether to cache the query inputs. Default to True. + col_order (Iterable[str]): + Alias for `columns`, retained for backwards compatibility. Specify at most one of `columns` and `col_order`; passing both raises a ValueError. Returns: bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table.