From 7a006b008aff49ede0d043038bdb7db798d2d092 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Mon, 13 Nov 2023 14:18:50 +0000 Subject: [PATCH 01/20] feat: Add filters argument to read_gbq for enhanced data querying --- bigframes/pandas/__init__.py | 2 + bigframes/session/__init__.py | 68 +++++++++++++++++++ tests/system/small/test_session.py | 52 ++++++++++++++ .../bigframes_vendored/pandas/io/gbq.py | 10 ++- 4 files changed, 131 insertions(+), 1 deletion(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index d35f838366..5b15a1b2de 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -486,6 +486,7 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + filters: Optional[List[Tuple]] = None, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( @@ -494,6 +495,7 @@ def read_gbq( index_col=index_col, col_order=col_order, max_results=max_results, + filters=filters, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 82c5a1c8d0..81a90c9ede 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -284,9 +284,11 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + filters: Optional[List[Tuple]] = None # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. + query_or_table = self._filters_to_query(query_or_table, filters) if _is_query(query_or_table): return self._read_gbq_query( query_or_table, @@ -307,6 +309,72 @@ def read_gbq( api_name="read_gbq", ) + def _filters_to_query(self, query_or_table, filters): + """Convert filters to query""" + + if (filters is None) or (len(filters) == 0): + return query_or_table + + valid_operators = ["IN", "NOT IN", "=", ">", "<", ">=", "<=", "!="] + + sub_query = ( + f"({query_or_table})" if _is_query(query_or_table) else query_or_table + ) + + where_clause = "" + if filters: + if not isinstance(filters, list): + raise ValueError("Filters should be a list.") + + if not ( + all(isinstance(item, list) for item in filters) + or all(isinstance(item, tuple) for item in filters) + ): + raise ValueError( + "All items in filters should be either all lists or all tuples." + ) + + if all(isinstance(sub_filter, tuple) for sub_filter in filters): + filters = [filters] + + grouped_expressions = [] + for group in filters: + if not isinstance(group, list): + raise ValueError("Each filter group should be a list.") + + group_expressions = [] + for filter_item in group: + if not isinstance(filter_item, tuple): + raise ValueError("Each filter condition should be a tuple.") + + column, operator, value = filter_item + operator = operator.upper() + + if operator not in valid_operators: + raise ValueError(f"Operator {operator} is not valid.") + + if operator in ["IN", "NOT IN"]: + if not isinstance(value, list): + raise ValueError( + f"Value for operator {operator} should be a list." + ) + value_list = ", ".join( + [f'"{v}"' if isinstance(v, str) else str(v) for v in value] + ) + expression = f"{column} {operator} ({value_list})" + else: + value = f'"{value}"' if isinstance(value, str) else value + expression = f"{column} {operator} {value}" + group_expressions.append(expression) + + grouped_expressions.append(" AND ".join(group_expressions)) + + where_clause = " WHERE " + " OR ".join(grouped_expressions) + + full_query = f"SELECT * FROM {sub_query} AS sub{where_clause}" + + return full_query + def _query_to_destination( self, query: str, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index bf72e444eb..a3cd492520 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -309,6 +309,58 @@ def test_read_gbq_w_script_no_select(session, dataset_id: str): assert df["statement_type"][0] == "SCRIPT" +@pytest.mark.parametrize( + ("query_or_table", "filters", "validator"), + [ + pytest.param( + """SELECT + rowindex, + string_col, + FROM `{scalars_table_id}` AS t + """, + [("rowindex", "<", 4), ("string_col", "=", "Hello, World!")], + lambda row: row["rowindex"] < 4 and row["string_col"] == "Hello, World!", + id="query_input", + ), + pytest.param( + "{scalars_table_id}", + [("date_col", ">", "2022-10-20")], + lambda row: pd.to_datetime(row["date_col"]) > pd.to_datetime("2022-10-20"), + id="table_input", + ), + pytest.param( + "{scalars_table_id}", + [ + ("rowindex", "not in", [0, 6]), + ("string_col", "in", ["Hello, World!", "こんにちは"]), + ], + lambda row: row["rowindex"] not in [0, 6] + and row["string_col"] in ["Hello, World!", "こんにちは"], + id="or_operation", + ), + pytest.param( + "{scalars_table_id}", + ["date_col", ">", "2022-10-20"], + None, + marks=pytest.mark.xfail( + raises=ValueError, + ), + id="raise_error", + ), + ], +) +def test_read_gbq_with_filters( + session, scalars_table_id: str, query_or_table, filters, validator +): + df = session.read_gbq( + query_or_table.format(scalars_table_id=scalars_table_id), + filters=filters, + ) + + for _, row in df.iterrows(): + assert validator(row) + + def test_read_gbq_model(session, penguins_linear_model_name): model = session.read_gbq_model(penguins_linear_model_name) assert isinstance(model, bigframes.ml.linear_model.LinearRegression) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 2161310b07..6c9709c1a6 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import Iterable, Optional +from typing import Iterable, List, Optional, Tuple from bigframes import constants @@ -16,6 +16,7 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, + filters: Optional[List[Tuple]] = None, ): """Loads a DataFrame from BigQuery. @@ -83,6 +84,13 @@ def read_gbq( max_results (Optional[int], default None): If set, limit the maximum number of rows to fetch from the query results. + filters (List[Tuple], default []): To filter out data. Filter syntax: + [[(column, op, val), …],…] where op is [==, =, >, >=, <, <=, !=, in, + not in] The innermost tuples are transposed into a set of filters + applied through an AND operation. The outer list combines these sets + of filters through an OR operation. A single list of tuples can also + be used, meaning that no OR operation between set of filters is to be + conducted. Returns: bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. From 37794a3add592359117c0f2ea19d7f4ce108b49e Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Mon, 13 Nov 2023 14:18:50 +0000 Subject: [PATCH 02/20] feat: Add filters argument to read_gbq for enhanced data querying --- third_party/bigframes_vendored/pandas/io/gbq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 6c9709c1a6..fc0fac0ce5 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, List, Optional, Tuple, Tuple from bigframes import constants @@ -85,7 +85,7 @@ def read_gbq( If set, limit the maximum number of rows to fetch from the query results. filters (List[Tuple], default []): To filter out data. Filter syntax: - [[(column, op, val), …],…] where op is [==, =, >, >=, <, <=, !=, in, + [[(column, op, val), …],…] where op is [=, >, >=, <, <=, !=, in, not in] The innermost tuples are transposed into a set of filters applied through an AND operation. The outer list combines these sets of filters through an OR operation. A single list of tuples can also From 499bdcd8e4da9f86f45110ab041296205a9d0006 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 03/20] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- third_party/bigframes_vendored/pandas/io/gbq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index fc0fac0ce5..0d92787204 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import Iterable, List, List, Optional, Tuple, Tuple +from typing import Iterable, List, Optional, Tuple from bigframes import constants From 300263efff480b8a1ce678f9b3df84492a28e7b0 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 04/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/pandas/__init__.py | 5 +- bigframes/session/__init__.py | 73 ++++++++++--------- tests/system/small/test_session.py | 8 +- .../bigframes_vendored/pandas/io/gbq.py | 36 ++++++--- 4 files changed, 75 insertions(+), 47 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 5b15a1b2de..a8615ef8b1 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -59,6 +59,7 @@ import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile +from third_party.bigframes_vendored.pandas.io.gbq import FiltersType # Include method definition so that the method appears in our docs for @@ -486,7 +487,8 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, - filters: Optional[List[Tuple]] = None, + columns: Iterable[str] = (), + filters: FiltersType = (), ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( @@ -495,6 +497,7 @@ def read_gbq( index_col=index_col, col_order=col_order, max_results=max_results, + columns=columns, filters=filters, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 81a90c9ede..4baee872c1 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -284,11 +284,12 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, - filters: Optional[List[Tuple]] = None + columns: Iterable[str] = (), + filters: third_party_pandas_gbq.FiltersType = (), # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. - query_or_table = self._filters_to_query(query_or_table, filters) + query_or_table = self._filters_to_query(query_or_table, columns, filters) if _is_query(query_or_table): return self._read_gbq_query( query_or_table, @@ -309,70 +310,70 @@ def read_gbq( api_name="read_gbq", ) - def _filters_to_query(self, query_or_table, filters): + def _filters_to_query(self, query_or_table, columns, filters): """Convert filters to query""" - - if (filters is None) or (len(filters) == 0): + if len(filters) == 0 and len(columns) == 0: return query_or_table - valid_operators = ["IN", "NOT IN", "=", ">", "<", ">=", "<=", "!="] - sub_query = ( f"({query_or_table})" if _is_query(query_or_table) else query_or_table ) + select_clause = "SELECT " + ( + ", ".join(f"`{column}`" for column in columns) if columns else "*" + ) where_clause = "" if filters: - if not isinstance(filters, list): - raise ValueError("Filters should be a list.") - - if not ( - all(isinstance(item, list) for item in filters) - or all(isinstance(item, tuple) for item in filters) + valid_operators = { + "in": "IN", + "not in": "NOT IN", + "==": "=", + ">": ">", + "<": "<", + ">=": ">=", + "<=": "<=", + "!=": "!=", + } + + if ( + isinstance(filters, Iterable) + and isinstance(filters[0], Tuple) + and (len(filters[0]) == 0 or not isinstance(filters[0][0], Tuple)) ): - raise ValueError( - "All items in filters should be either all lists or all tuples." - ) - - if all(isinstance(sub_filter, tuple) for sub_filter in filters): filters = [filters] - grouped_expressions = [] + or_expressions = [] for group in filters: - if not isinstance(group, list): - raise ValueError("Each filter group should be a list.") + if not isinstance(group, Iterable): + raise ValueError("Each filter group should be a iterable.") - group_expressions = [] + and_expressions = [] for filter_item in group: if not isinstance(filter_item, tuple): raise ValueError("Each filter condition should be a tuple.") column, operator, value = filter_item - operator = operator.upper() if operator not in valid_operators: raise ValueError(f"Operator {operator} is not valid.") + operator = valid_operators[operator] + if operator in ["IN", "NOT IN"]: - if not isinstance(value, list): - raise ValueError( - f"Value for operator {operator} should be a list." - ) value_list = ", ".join( - [f'"{v}"' if isinstance(v, str) else str(v) for v in value] + [repr(v) for v in value] ) - expression = f"{column} {operator} ({value_list})" + expression = f"`{column}` {operator} ({value_list})" else: - value = f'"{value}"' if isinstance(value, str) else value - expression = f"{column} {operator} {value}" - group_expressions.append(expression) - - grouped_expressions.append(" AND ".join(group_expressions)) + expression = f"`{column}` {operator} {repr(value)}" + and_expressions.append(expression) - where_clause = " WHERE " + " OR ".join(grouped_expressions) + or_expressions.append(" AND ".join(and_expressions)) - full_query = f"SELECT * FROM {sub_query} AS sub{where_clause}" + if or_expressions: + where_clause = " WHERE " + " OR ".join(or_expressions) + full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" return full_query def _query_to_destination( diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index a3cd492520..adba686b4a 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -318,7 +318,7 @@ def test_read_gbq_w_script_no_select(session, dataset_id: str): string_col, FROM `{scalars_table_id}` AS t """, - [("rowindex", "<", 4), ("string_col", "=", "Hello, World!")], + [("rowindex", "<", 4), ("string_col", "==", "Hello, World!")], lambda row: row["rowindex"] < 4 and row["string_col"] == "Hello, World!", id="query_input", ), @@ -361,6 +361,12 @@ def test_read_gbq_with_filters( assert validator(row) +def test_read_gbq_with_columns_filter(session, scalars_table_id: str): + cols = ["int64_too", "string_col", "date_col"] + df = session.read_gbq(scalars_table_id, columns=cols) + assert list(df.columns) == cols + + def test_read_gbq_model(session, penguins_linear_model_name): model = session.read_gbq_model(penguins_linear_model_name) assert isinstance(model, bigframes.ml.linear_model.LinearRegression) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 0d92787204..2580d6344f 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -3,10 +3,25 @@ from __future__ import annotations -from typing import Iterable, List, Optional, Tuple +from typing import Any, Iterable, Literal, Optional, Tuple, Union from bigframes import constants +FiltersType = ( + Iterable[ + Union[ + Tuple[str, Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"], Any], + Iterable[ + Tuple[ + str, + Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"], + Any, + ] + ], + ] + ], +) + class GBQIOMixin: def read_gbq( @@ -16,7 +31,8 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, - filters: Optional[List[Tuple]] = None, + columns: Iterable[str] = (), + filters: FiltersType = (), ): """Loads a DataFrame from BigQuery. @@ -84,13 +100,15 @@ def read_gbq( max_results (Optional[int], default None): If set, limit the maximum number of rows to fetch from the query results. - filters (List[Tuple], default []): To filter out data. Filter syntax: - [[(column, op, val), …],…] where op is [=, >, >=, <, <=, !=, in, - not in] The innermost tuples are transposed into a set of filters - applied through an AND operation. The outer list combines these sets - of filters through an OR operation. A single list of tuples can also - be used, meaning that no OR operation between set of filters is to be - conducted. + columns(Iterable[str], default ()): If not empty, only these columns + will be read from table. + filters (List[Tuple], default ()): To filter out data. Filter syntax: + [[(column, op, val), …],…] where op is [==, >, >=, <, <=, !=, in, + not in] The innermost tuples are transposed into a set of filters + applied through an AND operation. The outer list combines these + sets of filters through an OR operation. A single list of tuples + can also be used, meaning that no OR operation between set of + filters is to be conducted. Returns: bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. From fdc539da8b5955a6fb6bead7c089905340a8d91f Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Tue, 14 Nov 2023 01:48:29 +0000 Subject: [PATCH 05/20] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/session/__init__.py | 4 +--- third_party/bigframes_vendored/pandas/io/gbq.py | 8 ++++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 4baee872c1..9d90b70543 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -360,9 +360,7 @@ def _filters_to_query(self, query_or_table, columns, filters): operator = valid_operators[operator] if operator in ["IN", "NOT IN"]: - value_list = ", ".join( - [repr(v) for v in value] - ) + value_list = ", ".join([repr(v) for v in value]) expression = f"`{column}` {operator} ({value_list})" else: expression = f"`{column}` {operator} {repr(value)}" diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 2580d6344f..6ee5cf4166 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -100,14 +100,14 @@ def read_gbq( max_results (Optional[int], default None): If set, limit the maximum number of rows to fetch from the query results. - columns(Iterable[str], default ()): If not empty, only these columns + columns(Iterable[str], default ()): If not empty, only these columns will be read from table. filters (List[Tuple], default ()): To filter out data. Filter syntax: [[(column, op, val), …],…] where op is [==, >, >=, <, <=, !=, in, not in] The innermost tuples are transposed into a set of filters - applied through an AND operation. The outer list combines these - sets of filters through an OR operation. A single list of tuples - can also be used, meaning that no OR operation between set of + applied through an AND operation. The outer list combines these + sets of filters through an OR operation. A single list of tuples + can also be used, meaning that no OR operation between set of filters is to be conducted. Returns: From 6ed4194ff8ed44eab8104cf8056afd0877b89181 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 06/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/session/__init__.py | 51 ++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9d90b70543..35a5bdd0d5 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -286,10 +286,13 @@ def read_gbq( max_results: Optional[int] = None, columns: Iterable[str] = (), filters: third_party_pandas_gbq.FiltersType = (), + columns: Iterable[str] = (), + filters: third_party_pandas_gbq.FiltersType = (), # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. query_or_table = self._filters_to_query(query_or_table, columns, filters) + query_or_table = self._filters_to_query(query_or_table, columns, filters) if _is_query(query_or_table): return self._read_gbq_query( query_or_table, @@ -310,8 +313,10 @@ def read_gbq( api_name="read_gbq", ) + def _filters_to_query(self, query_or_table, columns, filters): def _filters_to_query(self, query_or_table, columns, filters): """Convert filters to query""" + if len(filters) == 0 and len(columns) == 0: if len(filters) == 0 and len(columns) == 0: return query_or_table @@ -321,6 +326,9 @@ def _filters_to_query(self, query_or_table, columns, filters): select_clause = "SELECT " + ( ", ".join(f"`{column}`" for column in columns) if columns else "*" ) + select_clause = "SELECT " + ( + ", ".join(f"`{column}`" for column in columns) if columns else "*" + ) where_clause = "" if filters: @@ -335,6 +343,22 @@ def _filters_to_query(self, query_or_table, columns, filters): "!=": "!=", } + if ( + isinstance(filters, Iterable) + and isinstance(filters[0], Tuple) + and (len(filters[0]) == 0 or not isinstance(filters[0][0], Tuple)) + ): + valid_operators = { + "in": "IN", + "not in": "NOT IN", + "==": "=", + ">": ">", + "<": "<", + ">=": ">=", + "<=": "<=", + "!=": "!=", + } + if ( isinstance(filters, Iterable) and isinstance(filters[0], Tuple) @@ -342,35 +366,56 @@ def _filters_to_query(self, query_or_table, columns, filters): ): filters = [filters] + or_expressions = [] or_expressions = [] for group in filters: if not isinstance(group, Iterable): - raise ValueError("Each filter group should be a iterable.") + raise ValueError( + f"Filter group should be a iterable, {group} is not valid." + ) + and_expressions = [] and_expressions = [] for filter_item in group: - if not isinstance(filter_item, tuple): - raise ValueError("Each filter condition should be a tuple.") + if not isinstance(filter_item, tuple) or (len(filter_item) != 3): + raise ValueError( + "Filter condition should be a tuple of length 3, {filter_item} is not valid." + ) column, operator, value = filter_item + if not isinstance(column, str): + raise ValueError( + f"Column name should be a string, but received '{column}' of type {type(column).__name__}." + ) + if operator not in valid_operators: raise ValueError(f"Operator {operator} is not valid.") operator = valid_operators[operator] + operator = valid_operators[operator] + if operator in ["IN", "NOT IN"]: value_list = ", ".join([repr(v) for v in value]) expression = f"`{column}` {operator} ({value_list})" + value_list = ", ".join([repr(v) for v in value]) + expression = f"`{column}` {operator} ({value_list})" else: expression = f"`{column}` {operator} {repr(value)}" and_expressions.append(expression) + expression = f"`{column}` {operator} {repr(value)}" + and_expressions.append(expression) or_expressions.append(" AND ".join(and_expressions)) + or_expressions.append(" AND ".join(and_expressions)) if or_expressions: where_clause = " WHERE " + " OR ".join(or_expressions) + if or_expressions: + where_clause = " WHERE " + " OR ".join(or_expressions) + full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" return full_query From ad6d37f0bc852a90b95aea2c9a3b7b01dbe1fcf4 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 07/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/session/__init__.py | 38 ++--------------------------------- 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 35a5bdd0d5..2db92139ea 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -286,13 +286,11 @@ def read_gbq( max_results: Optional[int] = None, columns: Iterable[str] = (), filters: third_party_pandas_gbq.FiltersType = (), - columns: Iterable[str] = (), - filters: third_party_pandas_gbq.FiltersType = (), # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. query_or_table = self._filters_to_query(query_or_table, columns, filters) - query_or_table = self._filters_to_query(query_or_table, columns, filters) + if _is_query(query_or_table): return self._read_gbq_query( query_or_table, @@ -313,19 +311,15 @@ def read_gbq( api_name="read_gbq", ) - def _filters_to_query(self, query_or_table, columns, filters): def _filters_to_query(self, query_or_table, columns, filters): """Convert filters to query""" - if len(filters) == 0 and len(columns) == 0: if len(filters) == 0 and len(columns) == 0: return query_or_table sub_query = ( f"({query_or_table})" if _is_query(query_or_table) else query_or_table ) - select_clause = "SELECT " + ( - ", ".join(f"`{column}`" for column in columns) if columns else "*" - ) + select_clause = "SELECT " + ( ", ".join(f"`{column}`" for column in columns) if columns else "*" ) @@ -343,22 +337,6 @@ def _filters_to_query(self, query_or_table, columns, filters): "!=": "!=", } - if ( - isinstance(filters, Iterable) - and isinstance(filters[0], Tuple) - and (len(filters[0]) == 0 or not isinstance(filters[0][0], Tuple)) - ): - valid_operators = { - "in": "IN", - "not in": "NOT IN", - "==": "=", - ">": ">", - "<": "<", - ">=": ">=", - "<=": "<=", - "!=": "!=", - } - if ( isinstance(filters, Iterable) and isinstance(filters[0], Tuple) @@ -366,7 +344,6 @@ def _filters_to_query(self, query_or_table, columns, filters): ): filters = [filters] - or_expressions = [] or_expressions = [] for group in filters: if not isinstance(group, Iterable): @@ -374,7 +351,6 @@ def _filters_to_query(self, query_or_table, columns, filters): f"Filter group should be a iterable, {group} is not valid." ) - and_expressions = [] and_expressions = [] for filter_item in group: if not isinstance(filter_item, tuple) or (len(filter_item) != 3): @@ -394,28 +370,18 @@ def _filters_to_query(self, query_or_table, columns, filters): operator = valid_operators[operator] - operator = valid_operators[operator] - if operator in ["IN", "NOT IN"]: value_list = ", ".join([repr(v) for v in value]) expression = f"`{column}` {operator} ({value_list})" - value_list = ", ".join([repr(v) for v in value]) - expression = f"`{column}` {operator} ({value_list})" else: expression = f"`{column}` {operator} {repr(value)}" and_expressions.append(expression) - expression = f"`{column}` {operator} {repr(value)}" - and_expressions.append(expression) or_expressions.append(" AND ".join(and_expressions)) - or_expressions.append(" AND ".join(and_expressions)) if or_expressions: where_clause = " WHERE " + " OR ".join(or_expressions) - if or_expressions: - where_clause = " WHERE " + " OR ".join(or_expressions) - full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" return full_query From 34737805bf4d77467567fda33357120e9ab7c789 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 08/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2db92139ea..d8bf91f201 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -355,7 +355,7 @@ def _filters_to_query(self, query_or_table, columns, filters): for filter_item in group: if not isinstance(filter_item, tuple) or (len(filter_item) != 3): raise ValueError( - "Filter condition should be a tuple of length 3, {filter_item} is not valid." + f"Filter condition should be a tuple of length 3, {filter_item} is not valid." ) column, operator, value = filter_item From 276bfd027f22bcc2ad9579043ab6d031b4afc103 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 09/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/pandas/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index a8615ef8b1..ab5b3feb32 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -59,7 +59,7 @@ import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile -from third_party.bigframes_vendored.pandas.io.gbq import FiltersType +import third_party.bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq # Include method definition so that the method appears in our docs for @@ -488,7 +488,7 @@ def read_gbq( col_order: Iterable[str] = (), max_results: Optional[int] = None, columns: Iterable[str] = (), - filters: FiltersType = (), + filters: vendored_pandas_gbq.FiltersType = (), ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( From 8a4e9401f0b99406febd70946b26c7004abdffd6 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 10/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/system/small/test_session.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index adba686b4a..3a2c5d6f65 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -331,11 +331,11 @@ def test_read_gbq_w_script_no_select(session, dataset_id: str): pytest.param( "{scalars_table_id}", [ - ("rowindex", "not in", [0, 6]), - ("string_col", "in", ["Hello, World!", "こんにちは"]), + (("rowindex", "not in", [0, 6])), + (("string_col", "in", ["Hello, World!", "こんにちは"])), ], lambda row: row["rowindex"] not in [0, 6] - and row["string_col"] in ["Hello, World!", "こんにちは"], + or row["string_col"] in ["Hello, World!", "こんにちは"], id="or_operation", ), pytest.param( From c00a05e17536c848a084f77c1947d4e8713004b6 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 11/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/system/small/test_session.py | 58 ------------------- tests/unit/session/test_session.py | 57 ++++++++++++++++++ .../bigframes_vendored/pandas/io/gbq.py | 32 ++++------ 3 files changed, 67 insertions(+), 80 deletions(-) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 3a2c5d6f65..bf72e444eb 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -309,64 +309,6 @@ def test_read_gbq_w_script_no_select(session, dataset_id: str): assert df["statement_type"][0] == "SCRIPT" -@pytest.mark.parametrize( - ("query_or_table", "filters", "validator"), - [ - pytest.param( - """SELECT - rowindex, - string_col, - FROM `{scalars_table_id}` AS t - """, - [("rowindex", "<", 4), ("string_col", "==", "Hello, World!")], - lambda row: row["rowindex"] < 4 and row["string_col"] == "Hello, World!", - id="query_input", - ), - pytest.param( - "{scalars_table_id}", - [("date_col", ">", "2022-10-20")], - lambda row: pd.to_datetime(row["date_col"]) > pd.to_datetime("2022-10-20"), - id="table_input", - ), - pytest.param( - "{scalars_table_id}", - [ - (("rowindex", "not in", [0, 6])), - (("string_col", "in", ["Hello, World!", "こんにちは"])), - ], - lambda row: row["rowindex"] not in [0, 6] - or row["string_col"] in ["Hello, World!", "こんにちは"], - id="or_operation", - ), - pytest.param( - "{scalars_table_id}", - ["date_col", ">", "2022-10-20"], - None, - marks=pytest.mark.xfail( - raises=ValueError, - ), - id="raise_error", - ), - ], -) -def test_read_gbq_with_filters( - session, scalars_table_id: str, query_or_table, filters, validator -): - df = session.read_gbq( - query_or_table.format(scalars_table_id=scalars_table_id), - filters=filters, - ) - - for _, row in df.iterrows(): - assert validator(row) - - -def test_read_gbq_with_columns_filter(session, scalars_table_id: str): - cols = ["int64_too", "string_col", "date_col"] - df = session.read_gbq(scalars_table_id, columns=cols) - assert list(df.columns) == cols - - def test_read_gbq_model(session, penguins_linear_model_name): model = session.read_gbq_model(penguins_linear_model_name) assert isinstance(model, bigframes.ml.linear_model.LinearRegression) diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 18fd42e0f3..d38a393f27 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -57,3 +57,60 @@ def test_session_init_fails_with_no_project(): credentials=mock.Mock(spec=google.auth.credentials.Credentials) ) ) + + +@pytest.mark.parametrize( + ("query_or_table", "columns", "filters", "expected_output"), + [ + pytest.param( + """SELECT + rowindex, + string_col, + FROM `test_table` AS t + """, + [], + [("rowindex", "<", 4), ("string_col", "==", "Hello, World!")], + """SELECT * FROM (SELECT + rowindex, + string_col, + FROM `test_table` AS t + ) AS sub WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!'""", + id="query_input", + ), + pytest.param( + "test_table", + [], + [("date_col", ">", "2022-10-20")], + "SELECT * FROM test_table AS sub WHERE `date_col` > '2022-10-20'", + id="table_input", + ), + pytest.param( + "test_table", + ["row_index", "string_col"], + [ + (("rowindex", "not in", [0, 6]),), + (("string_col", "in", ["Hello, World!", "こんにちは"]),), + ], + ( + "SELECT `row_index`, `string_col` FROM test_table AS sub WHERE " + "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " + "'こんにちは')" + ), + id="or_operation", + ), + pytest.param( + "test_table", + [], + ["date_col", ">", "2022-10-20"], + None, + marks=pytest.mark.xfail( + raises=ValueError, + ), + id="raise_error", + ), + ], +) +def test_read_gbq_with_filters(query_or_table, columns, filters, expected_output): + session = resources.create_bigquery_session() + query = session._filters_to_query(query_or_table, columns, filters) + assert query == expected_output diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 6ee5cf4166..d18490b408 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -7,20 +7,8 @@ from bigframes import constants -FiltersType = ( - Iterable[ - Union[ - Tuple[str, Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"], Any], - Iterable[ - Tuple[ - str, - Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"], - Any, - ] - ], - ] - ], -) +FilterType = Tuple[str, Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"], Any] +FiltersType = Iterable[Union[FilterType, Iterable[FilterType]]] class GBQIOMixin: @@ -100,15 +88,15 @@ def read_gbq( max_results (Optional[int], default None): If set, limit the maximum number of rows to fetch from the query results. - columns(Iterable[str], default ()): If not empty, only these columns + columns (Iterable[str], default ()): If not empty, only these columns will be read from table. - filters (List[Tuple], default ()): To filter out data. Filter syntax: - [[(column, op, val), …],…] where op is [==, >, >=, <, <=, !=, in, - not in] The innermost tuples are transposed into a set of filters - applied through an AND operation. The outer list combines these - sets of filters through an OR operation. A single list of tuples - can also be used, meaning that no OR operation between set of - filters is to be conducted. + filters (Iterable[Iterable[[Tuple]], default ()): To filter out data. + Filter syntax: [[(column, op, val), …],…] where op is [==, >, >=, + <, <=, !=, in, not in] The innermost tuples are transposed into a + set of filters applied through an AND operation. The outer list + combines these sets of filters through an OR operation. A single + list of tuples can also be used, meaning that no OR operation + between set of filters is to be conducted. Returns: bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. From dd94369b3f36457f9d35ef3d7b69112ed3f22dd1 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 12/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- .../bigframes_vendored/pandas/io/gbq.py | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index d18490b408..2a41e4e4f7 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -75,6 +75,18 @@ def read_gbq( [2 rows x 3 columns] + Reading data with `columns` and `filters` parameters: + + >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] + >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe'])] + >>> df = bpd.read_gbq("bigquery-public-data.baseball.games_wide", columns=columns, filters=filters) + >>> df.head(1) + + pitcherFirstName pitcherLastName year pitchSpeed + 0 John Axford 2016 98 + + [1 rows x 4 columns in total] + Args: query_or_table (str): A SQL string to be executed or a BigQuery table to be read. The @@ -90,13 +102,14 @@ def read_gbq( query results. columns (Iterable[str], default ()): If not empty, only these columns will be read from table. - filters (Iterable[Iterable[[Tuple]], default ()): To filter out data. - Filter syntax: [[(column, op, val), …],…] where op is [==, >, >=, - <, <=, !=, in, not in] The innermost tuples are transposed into a - set of filters applied through an AND operation. The outer list - combines these sets of filters through an OR operation. A single - list of tuples can also be used, meaning that no OR operation - between set of filters is to be conducted. + filters (Iterable[Union[Tuple, Iterable[Tuple]]], default ()): To + filter out data. Filter syntax: [[(column, op, val), …],…] where + op is [==, >, >=, <, <=, !=, in, not in]. The innermost tuples + are transposed into a set of filters applied through an AND + operation. The outer Iterable combines these sets of filters + through an OR operation. A single Iterable of tuples can also + be used, meaning that no OR operation between set of filters + is to be conducted. Returns: bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. From 54ca688b210dfa30c2d9232d5cf5be0f872484a8 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 13/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- third_party/bigframes_vendored/pandas/io/gbq.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 2a41e4e4f7..c937b273d3 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -79,7 +79,11 @@ def read_gbq( >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe'])] - >>> df = bpd.read_gbq("bigquery-public-data.baseball.games_wide", columns=columns, filters=filters) + >>> df = bpd.read_gbq( + ... "bigquery-public-data.baseball.games_wide", + ... columns=columns, + ... filters=filters, + ... ) >>> df.head(1) pitcherFirstName pitcherLastName year pitchSpeed From 95e318ba6fc8190e5756ff3c7d3a5b8d961fa2be Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 20 Nov 2023 20:36:53 +0000 Subject: [PATCH 14/20] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- third_party/bigframes_vendored/pandas/io/gbq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index c937b273d3..73db7bdb3b 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -80,8 +80,8 @@ def read_gbq( >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe'])] >>> df = bpd.read_gbq( - ... "bigquery-public-data.baseball.games_wide", - ... columns=columns, + ... "bigquery-public-data.baseball.games_wide", + ... columns=columns, ... filters=filters, ... ) >>> df.head(1) From ced491f50b29ee8423ec8d91c7f6af73a0cd2f2b Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 13 Nov 2023 14:40:18 +0000 Subject: [PATCH 15/20] feat: Add filters and columns arguments to read_gbq for enhanced data querying See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- third_party/bigframes_vendored/pandas/io/gbq.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 73db7bdb3b..c22aedbb20 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -80,16 +80,15 @@ def read_gbq( >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe'])] >>> df = bpd.read_gbq( - ... "bigquery-public-data.baseball.games_wide", - ... columns=columns, + ... "bigquery-public-data.baseball.games_wide", + ... columns=columns, ... filters=filters, ... ) >>> df.head(1) - - pitcherFirstName pitcherLastName year pitchSpeed - 0 John Axford 2016 98 + pitcherFirstName pitcherLastName year pitchSpeed + 0 John Axford 2016 98 - [1 rows x 4 columns in total] + [1 rows x 4 columns] Args: query_or_table (str): From 0f2840d3d400da67f11eb26bda1eb88dc185f829 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 20 Nov 2023 21:24:22 +0000 Subject: [PATCH 16/20] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- third_party/bigframes_vendored/pandas/io/gbq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index c22aedbb20..643a16e45a 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -80,8 +80,8 @@ def read_gbq( >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe'])] >>> df = bpd.read_gbq( - ... "bigquery-public-data.baseball.games_wide", - ... columns=columns, + ... "bigquery-public-data.baseball.games_wide", + ... columns=columns, ... filters=filters, ... ) >>> df.head(1) From 82f74fdd4950f8f4fb9dbd18e1f52dc00c137496 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 12 Dec 2023 23:41:50 +0000 Subject: [PATCH 17/20] update docstring --- third_party/bigframes_vendored/pandas/io/gbq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index ea1533a452..3d5a198e58 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -79,7 +79,7 @@ def read_gbq( Reading data with `columns` and `filters` parameters: >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] - >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe'])] + >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant'])] >>> df = bpd.read_gbq( ... "bigquery-public-data.baseball.games_wide", ... columns=columns, @@ -87,7 +87,7 @@ def read_gbq( ... ) >>> df.head(1) pitcherFirstName pitcherLastName year pitchSpeed - 0 John Axford 2016 98 + 0 John Gant 2016 82 [1 rows x 4 columns] From 354fd8e5878a0ab3b4fc90feb0ae4ededd7feaf5 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 13 Dec 2023 17:57:00 +0000 Subject: [PATCH 18/20] remove columns input --- bigframes/pandas/__init__.py | 2 -- bigframes/session/__init__.py | 5 +++-- third_party/bigframes_vendored/pandas/io/gbq.py | 7 ++----- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 427d62fef6..c9640abb23 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -487,7 +487,6 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, - columns: Iterable[str] = (), filters: vendored_pandas_gbq.FiltersType = (), use_cache: bool = True, ) -> bigframes.dataframe.DataFrame: @@ -498,7 +497,6 @@ def read_gbq( index_col=index_col, col_order=col_order, max_results=max_results, - columns=columns, filters=filters, use_cache=use_cache, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index bdfa9e48bb..65fe451204 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -233,13 +233,14 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, - columns: Iterable[str] = (), filters: third_party_pandas_gbq.FiltersType = (), use_cache: bool = True, # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. - query_or_table = self._filters_to_query(query_or_table, columns, filters) + query_or_table = self._filters_to_query( + query_or_table, columns=col_order, filters=filters + ) if _is_query(query_or_table): return self._read_gbq_query( diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 3d5a198e58..dc8bcc1f77 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -19,7 +19,6 @@ def read_gbq( index_col: Iterable[str] | str = (), col_order: Iterable[str] = (), max_results: Optional[int] = None, - columns: Iterable[str] = (), filters: FiltersType = (), use_cache: bool = True, ): @@ -78,11 +77,11 @@ def read_gbq( Reading data with `columns` and `filters` parameters: - >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] + >>> col_order = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant'])] >>> df = bpd.read_gbq( ... "bigquery-public-data.baseball.games_wide", - ... columns=columns, + ... col_order=col_order, ... filters=filters, ... ) >>> df.head(1) @@ -104,8 +103,6 @@ def read_gbq( max_results (Optional[int], default None): If set, limit the maximum number of rows to fetch from the query results. - columns (Iterable[str], default ()): If not empty, only these columns - will be read from table. filters (Iterable[Union[Tuple, Iterable[Tuple]]], default ()): To filter out data. Filter syntax: [[(column, op, val), …],…] where op is [==, >, >=, <, <=, !=, in, not in]. The innermost tuples From 434c5599a0e2b15001aa3a8466667791b7ab100a Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 13 Dec 2023 18:20:43 +0000 Subject: [PATCH 19/20] make filter_to_query run only when there are filters --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 65fe451204..e49912775b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -266,7 +266,7 @@ def read_gbq( def _filters_to_query(self, query_or_table, columns, filters): """Convert filters to query""" - if len(filters) == 0 and len(columns) == 0: + if len(filters) == 0: return query_or_table sub_query = ( From c17b81510ab536eb424db97325f7e2a56c36ea21 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 13 Dec 2023 18:21:57 +0000 Subject: [PATCH 20/20] remove named input --- bigframes/session/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index e49912775b..5364060d1c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -238,9 +238,7 @@ def read_gbq( # Add a verify index argument that fails if the index is not unique. ) -> dataframe.DataFrame: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. - query_or_table = self._filters_to_query( - query_or_table, columns=col_order, filters=filters - ) + query_or_table = self._filters_to_query(query_or_table, col_order, filters) if _is_query(query_or_table): return self._read_gbq_query(