diff --git a/CHANGELOG.md b/CHANGELOG.md
index 374448a5e..62352c344 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,20 @@
 
 [1]: https://pypi.org/project/google-cloud-bigquery/#history
 
+## [3.36.0](https://github.com/googleapis/python-bigquery/compare/v3.35.1...v3.36.0) (2025-08-20)
+
+
+### Features
+
+* Add created/started/ended properties to RowIterator. ([#2260](https://github.com/googleapis/python-bigquery/issues/2260)) ([0a95b24](https://github.com/googleapis/python-bigquery/commit/0a95b24192395cc3ccf801aa9bc318999873a2bf))
+* Retry query jobs if `jobBackendError` or `jobInternalError` are encountered ([#2256](https://github.com/googleapis/python-bigquery/issues/2256)) ([3deff1d](https://github.com/googleapis/python-bigquery/commit/3deff1d963980800e8b79fa3aaf5b712d4fd5062))
+
+
+### Documentation
+
+* Add a TROUBLESHOOTING.md file with tips for logging ([#2262](https://github.com/googleapis/python-bigquery/issues/2262)) ([b684832](https://github.com/googleapis/python-bigquery/commit/b68483227693ea68f6b12eacca2be1803cffb1d1))
+* Update README to break infinite redirect loop ([#2254](https://github.com/googleapis/python-bigquery/issues/2254)) ([8f03166](https://github.com/googleapis/python-bigquery/commit/8f031666114a826da2ad965f8ecd4727466cb480))
+
 ## [3.35.1](https://github.com/googleapis/python-bigquery/compare/v3.35.0...v3.35.1) (2025-07-21)
 
diff --git a/README.rst b/README.rst
index 29e15e067..23ed9257d 100644
--- a/README.rst
+++ b/README.rst
@@ -18,7 +18,7 @@ processing power of Google's infrastructure.
 .. |versions| image:: https://img.shields.io/pypi/pyversions/google-cloud-bigquery.svg
    :target: https://pypi.org/project/google-cloud-bigquery/
 .. _BigQuery: https://cloud.google.com/bigquery/what-is-bigquery
-.. _Client Library Documentation: https://googleapis.dev/python/bigquery/latest
+.. _Client Library Documentation: https://cloud.google.com/python/docs/reference/bigquery/latest/summary_overview
 .. _Product Documentation: https://cloud.google.com/bigquery/docs/reference/v2/
 
 Quick Start
diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md
new file mode 100644
index 000000000..7da12c440
--- /dev/null
+++ b/TROUBLESHOOTING.md
@@ -0,0 +1,34 @@
+# Troubleshooting steps
+
+## Enable logging of BQ Storage Read API session creation
+
+Capturing the BQ Storage Read API session ID can help the BigQuery backend
+team debug cases of API instability. Session creation is logged by a
+module-specific logger. To enable these logs, refer to the following code
+sample:
+
+```python
+import logging
+import google.cloud.bigquery
+
+# Configure basic logging to show DEBUG-level messages.
+log_formatter = logging.Formatter(
+    '%(asctime)s - %(levelname)s - %(message)s'
+)
+handler = logging.StreamHandler()
+handler.setFormatter(log_formatter)
+default_logger = logging.getLogger()
+default_logger.setLevel(logging.DEBUG)
+default_logger.addHandler(handler)
+to_dataframe_logger = logging.getLogger("google.cloud.bigquery._pandas_helpers")
+to_dataframe_logger.setLevel(logging.DEBUG)
+to_dataframe_logger.addHandler(handler)
+
+# Example code that touches the BQ Storage Read API.
+bqclient = google.cloud.bigquery.Client()
+results = bqclient.query_and_wait("SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`")
+print(results.to_dataframe().head())
+```
+
+In particular, watch for the text "with BQ Storage API session" in the logs
+to find the streaming API session ID to share with your support person.
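The sample above turns on DEBUG output for the root logger, which also surfaces chatter from every other library in the process. As a quieter alternative, here is a minimal sketch that attaches the handler only to the module-specific logger named in the file above; the `propagate` flag and the example query are illustrative choices, not part of the patch:

```python
import logging

import google.cloud.bigquery

# Attach the handler only to the module-specific logger so that DEBUG
# output from unrelated libraries stays quiet.
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
)
to_dataframe_logger = logging.getLogger("google.cloud.bigquery._pandas_helpers")
to_dataframe_logger.setLevel(logging.DEBUG)
to_dataframe_logger.addHandler(handler)
to_dataframe_logger.propagate = False  # keep these records off the root logger

# Example code that touches the BQ Storage Read API.
bqclient = google.cloud.bigquery.Client()
results = bqclient.query_and_wait(
    "SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10"
)
print(results.to_dataframe().head())
```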
diff --git a/google/cloud/bigquery/_job_helpers.py b/google/cloud/bigquery/_job_helpers.py
index 73d4f6e7b..6fd561f8c 100644
--- a/google/cloud/bigquery/_job_helpers.py
+++ b/google/cloud/bigquery/_job_helpers.py
@@ -35,17 +35,22 @@ predicates where it is safe to generate a new query ID.
 """
 
+from __future__ import annotations
+
 import copy
+import dataclasses
+import datetime
 import functools
 import uuid
 import textwrap
-from typing import Any, Dict, Optional, TYPE_CHECKING, Union
+from typing import Any, Callable, Dict, Optional, TYPE_CHECKING, Union
 import warnings
 
 import google.api_core.exceptions as core_exceptions
 from google.api_core import retry as retries
 
 from google.cloud.bigquery import job
+import google.cloud.bigquery.job.query
 import google.cloud.bigquery.query
 from google.cloud.bigquery import table
 import google.cloud.bigquery.retry
@@ -116,14 +121,21 @@ def query_jobs_insert(
     retry: Optional[retries.Retry],
     timeout: Optional[float],
     job_retry: Optional[retries.Retry],
+    *,
+    callback: Callable = lambda _: None,
 ) -> job.QueryJob:
     """Initiate a query using jobs.insert.
 
     See: https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert
+
+    Args:
+        callback (Callable):
+            A callback function used by bigframes to report query progress.
     """
     job_id_given = job_id is not None
     job_id_save = job_id
     job_config_save = job_config
+    query_sent_factory = QuerySentEventFactory()
 
     def do_query():
         # Make a copy now, so that original doesn't get changed by the process
@@ -136,6 +148,16 @@ def do_query():
 
         try:
             query_job._begin(retry=retry, timeout=timeout)
+            if job_config is not None and not job_config.dry_run:
+                callback(
+                    query_sent_factory(
+                        query=query,
+                        billing_project=query_job.project,
+                        location=query_job.location,
+                        job_id=query_job.job_id,
+                        request_id=None,
+                    )
+                )
         except core_exceptions.Conflict as create_exc:
             # The thought is if someone is providing their own job IDs and they get
             # their job ID generation wrong, this could end up returning results for
@@ -396,6 +418,7 @@ def query_and_wait(
     job_retry: Optional[retries.Retry],
     page_size: Optional[int] = None,
     max_results: Optional[int] = None,
+    callback: Callable = lambda _: None,
 ) -> table.RowIterator:
     """Run the query, wait for it to finish, and return the results.
 
@@ -415,9 +438,8 @@ def query_and_wait(
         location (Optional[str]):
             Location where to run the job. Must match the location of the
             table used in the query as well as the destination table.
-        project (Optional[str]):
-            Project ID of the project of where to run the job. Defaults
-            to the client's project.
+        project (str):
+            Project ID of the project where the job runs.
         api_timeout (Optional[float]):
             The number of seconds to wait for the underlying HTTP transport
             before using ``retry``.
@@ -441,6 +463,8 @@
             request. Non-positive values are ignored.
         max_results (Optional[int]):
             The maximum total number of rows from this request.
+        callback (Callable):
+            A callback function used by bigframes to report query progress.
Returns: google.cloud.bigquery.table.RowIterator: @@ -479,12 +503,14 @@ def query_and_wait( retry=retry, timeout=api_timeout, job_retry=job_retry, + callback=callback, ), api_timeout=api_timeout, wait_timeout=wait_timeout, retry=retry, page_size=page_size, max_results=max_results, + callback=callback, ) path = _to_query_path(project) @@ -496,10 +522,24 @@ def query_and_wait( if client.default_job_creation_mode: request_body["jobCreationMode"] = client.default_job_creation_mode + query_sent_factory = QuerySentEventFactory() + def do_query(): - request_body["requestId"] = make_job_id() + request_id = make_job_id() + request_body["requestId"] = request_id span_attributes = {"path": path} + if "dryRun" not in request_body: + callback( + query_sent_factory( + query=query, + billing_project=project, + location=location, + job_id=None, + request_id=request_id, + ) + ) + # For easier testing, handle the retries ourselves. if retry is not None: response = retry(client._call_api)( @@ -542,8 +582,25 @@ def do_query(): retry=retry, page_size=page_size, max_results=max_results, + callback=callback, ) + if "dryRun" not in request_body: + callback( + QueryFinishedEvent( + billing_project=project, + location=query_results.location, + query_id=query_results.query_id, + job_id=query_results.job_id, + total_rows=query_results.total_rows, + total_bytes_processed=query_results.total_bytes_processed, + slot_millis=query_results.slot_millis, + destination=None, + created=query_results.created, + started=query_results.started, + ended=query_results.ended, + ) + ) return table.RowIterator( client=client, api_request=functools.partial(client._call_api, retry, timeout=api_timeout), @@ -561,6 +618,9 @@ def do_query(): query=query, total_bytes_processed=query_results.total_bytes_processed, slot_millis=query_results.slot_millis, + created=query_results.created, + started=query_results.started, + ended=query_results.ended, ) if job_retry is not None: @@ -611,6 +671,8 @@ def _wait_or_cancel( retry: Optional[retries.Retry], page_size: Optional[int], max_results: Optional[int], + *, + callback: Callable = lambda _: None, ) -> table.RowIterator: """Wait for a job to complete and return the results. @@ -618,12 +680,43 @@ def _wait_or_cancel( the job. """ try: - return job.result( + if not job.dry_run: + callback( + QueryReceivedEvent( + billing_project=job.project, + location=job.location, + job_id=job.job_id, + statement_type=job.statement_type, + state=job.state, + query_plan=job.query_plan, + created=job.created, + started=job.started, + ended=job.ended, + ) + ) + query_results = job.result( page_size=page_size, max_results=max_results, retry=retry, timeout=wait_timeout, ) + if not job.dry_run: + callback( + QueryFinishedEvent( + billing_project=job.project, + location=query_results.location, + query_id=query_results.query_id, + job_id=query_results.job_id, + total_rows=query_results.total_rows, + total_bytes_processed=query_results.total_bytes_processed, + slot_millis=query_results.slot_millis, + destination=job.destination, + created=job.created, + started=job.started, + ended=job.ended, + ) + ) + return query_results except Exception: # Attempt to cancel the job since we can't return the results. try: @@ -632,3 +725,62 @@ def _wait_or_cancel( # Don't eat the original exception if cancel fails. 
pass raise + + +@dataclasses.dataclass(frozen=True) +class QueryFinishedEvent: + """Query finished successfully.""" + + billing_project: Optional[str] + location: Optional[str] + query_id: Optional[str] + job_id: Optional[str] + destination: Optional[table.TableReference] + total_rows: Optional[int] + total_bytes_processed: Optional[int] + slot_millis: Optional[int] + created: Optional[datetime.datetime] + started: Optional[datetime.datetime] + ended: Optional[datetime.datetime] + + +@dataclasses.dataclass(frozen=True) +class QueryReceivedEvent: + """Query received and acknowledged by the BigQuery API.""" + + billing_project: Optional[str] + location: Optional[str] + job_id: Optional[str] + statement_type: Optional[str] + state: Optional[str] + query_plan: Optional[list[google.cloud.bigquery.job.query.QueryPlanEntry]] + created: Optional[datetime.datetime] + started: Optional[datetime.datetime] + ended: Optional[datetime.datetime] + + +@dataclasses.dataclass(frozen=True) +class QuerySentEvent: + """Query sent to BigQuery.""" + + query: str + billing_project: Optional[str] + location: Optional[str] + job_id: Optional[str] + request_id: Optional[str] + + +class QueryRetryEvent(QuerySentEvent): + """Query sent another time because the previous attempt failed.""" + + +class QuerySentEventFactory: + """Creates a QuerySentEvent first, then QueryRetryEvent after that.""" + + def __init__(self): + self._event_constructor = QuerySentEvent + + def __call__(self, **kwargs): + result = self._event_constructor(**kwargs) + self._event_constructor = QueryRetryEvent + return result diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 804f77ea2..4ca2cb428 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -15,6 +15,7 @@ """Client for interacting with the Google BigQuery API.""" from __future__ import absolute_import +from __future__ import annotations from __future__ import division from collections import abc as collections_abc @@ -31,6 +32,7 @@ import typing from typing import ( Any, + Callable, Dict, IO, Iterable, @@ -3633,8 +3635,8 @@ def query_and_wait( rate-limit-exceeded errors. Passing ``None`` disables job retry. Not all jobs can be retried. page_size (Optional[int]): - The maximum number of rows in each page of results from this - request. Non-positive values are ignored. + The maximum number of rows in each page of results from the + initial jobs.query request. Non-positive values are ignored. max_results (Optional[int]): The maximum total number of rows from this request. @@ -3656,6 +3658,39 @@ def query_and_wait( :class:`~google.cloud.bigquery.job.QueryJobConfig` class. """ + return self._query_and_wait_bigframes( + query, + job_config=job_config, + location=location, + project=project, + api_timeout=api_timeout, + wait_timeout=wait_timeout, + retry=retry, + job_retry=job_retry, + page_size=page_size, + max_results=max_results, + ) + + def _query_and_wait_bigframes( + self, + query, + *, + job_config: Optional[QueryJobConfig] = None, + location: Optional[str] = None, + project: Optional[str] = None, + api_timeout: TimeoutType = DEFAULT_TIMEOUT, + wait_timeout: Union[Optional[float], object] = POLLING_DEFAULT_VALUE, + retry: retries.Retry = DEFAULT_RETRY, + job_retry: retries.Retry = DEFAULT_JOB_RETRY, + page_size: Optional[int] = None, + max_results: Optional[int] = None, + callback: Callable = lambda _: None, + ) -> RowIterator: + """See query_and_wait. 
+
+        This method has an extra callback parameter, which is used by bigframes
+        to create better progress bars.
+        """
         if project is None:
             project = self.project
 
@@ -3681,6 +3716,7 @@ def query_and_wait(
             job_retry=job_retry,
             page_size=page_size,
             max_results=max_results,
+            callback=callback,
         )
 
     def insert_rows(
@@ -4145,6 +4181,9 @@ def _list_rows_from_query_results(
         query: Optional[str] = None,
         total_bytes_processed: Optional[int] = None,
         slot_millis: Optional[int] = None,
+        created: Optional[datetime.datetime] = None,
+        started: Optional[datetime.datetime] = None,
+        ended: Optional[datetime.datetime] = None,
     ) -> RowIterator:
         """List the rows of a completed query.
         See
@@ -4198,6 +4237,12 @@ def _list_rows_from_query_results(
                 total bytes processed from job statistics, if present.
             slot_millis (Optional[int]):
                 Number of slot ms the user is actually billed for.
+            created (Optional[datetime.datetime]):
+                Datetime at which the job was created.
+            started (Optional[datetime.datetime]):
+                Datetime at which the job was started.
+            ended (Optional[datetime.datetime]):
+                Datetime at which the job finished.
 
         Returns:
             google.cloud.bigquery.table.RowIterator:
@@ -4238,6 +4283,9 @@ def _list_rows_from_query_results(
             query=query,
             total_bytes_processed=total_bytes_processed,
             slot_millis=slot_millis,
+            created=created,
+            started=started,
+            ended=ended,
         )
         return row_iterator
 
diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py
index ec9379ea9..b377f979d 100644
--- a/google/cloud/bigquery/job/query.py
+++ b/google/cloud/bigquery/job/query.py
@@ -1550,6 +1550,8 @@ def result(  # type: ignore  # (incompatible with supertype)
             return _EmptyRowIterator(
                 project=self.project,
                 location=self.location,
+                schema=self.schema,
+                total_bytes_processed=self.total_bytes_processed,
                 # Intentionally omit job_id and query_id since this doesn't
                 # actually correspond to a finished query job.
             )
@@ -1737,7 +1739,11 @@ def is_job_done():
                 project=self.project,
                 job_id=self.job_id,
                 query_id=self.query_id,
+                schema=self.schema,
                 num_dml_affected_rows=self._query_results.num_dml_affected_rows,
+                query=self.query,
+                total_bytes_processed=self.total_bytes_processed,
+                slot_millis=self.slot_millis,
             )
 
         # We know that there's at least 1 row, so only treat the response from
@@ -1767,6 +1773,9 @@ def is_job_done():
             query=self.query,
             total_bytes_processed=self.total_bytes_processed,
             slot_millis=self.slot_millis,
+            created=self.created,
+            started=self.started,
+            ended=self.ended,
             **list_rows_kwargs,
         )
         rows._preserve_order = _contains_order_by(self.query)
diff --git a/google/cloud/bigquery/query.py b/google/cloud/bigquery/query.py
index 4a006d621..7f70f6a2a 100644
--- a/google/cloud/bigquery/query.py
+++ b/google/cloud/bigquery/query.py
@@ -1228,11 +1228,18 @@ def location(self):
 
         See:
         https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.job_reference
+        or https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.location
 
         Returns:
             str: Job ID of the query job.
         """
-        return self._properties.get("jobReference", {}).get("location")
+        location = self._properties.get("jobReference", {}).get("location")
+
+        # Sometimes there's no job, but we still want to get the location
+        # information. Prefer the value from the job for backwards compatibility.
+ if not location: + location = self._properties.get("location") + return location @property def query_id(self) -> Optional[str]: @@ -1287,7 +1294,7 @@ def slot_millis(self): """Total number of slot ms the user is actually billed for. See: - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.slot_millis + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.total_slot_ms Returns: Optional[int]: Count generated on the server (None until set by the server). @@ -1310,6 +1317,56 @@ def num_dml_affected_rows(self): if num_dml_affected_rows is not None: return int(num_dml_affected_rows) + @property + def created(self): + """Creation time of this query. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.creation_time + + Returns: + Optional[datetime.datetime]: + the creation time (None until set from the server). + """ + millis = self._properties.get("creationTime") + if millis is not None: + return _helpers._datetime_from_microseconds(int(millis) * 1000.0) + + @property + def started(self): + """Start time of this query. + + This field will be present when the query transitions from the + PENDING state to either RUNNING or DONE. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.start_time + + Returns: + Optional[datetime.datetime]: + the start time (None until set from the server). + """ + millis = self._properties.get("startTime") + if millis is not None: + return _helpers._datetime_from_microseconds(int(millis) * 1000.0) + + @property + def ended(self): + """End time of this query. + + This field will be present whenever a query is in the DONE state. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.end_time + + Returns: + Optional[datetime.datetime]: + the end time (None until set from the server). + """ + millis = self._properties.get("endTime") + if millis is not None: + return _helpers._datetime_from_microseconds(int(millis) * 1000.0) + @property def rows(self): """Query results. diff --git a/google/cloud/bigquery/retry.py b/google/cloud/bigquery/retry.py index 999d0e851..8f469f2d3 100644 --- a/google/cloud/bigquery/retry.py +++ b/google/cloud/bigquery/retry.py @@ -124,6 +124,8 @@ def _should_retry_get_job_conflict(exc): "rateLimitExceeded", "backendError", "internalError", + "jobBackendError", + "jobInternalError", "jobRateLimitExceeded", ) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index dbdde36d1..219b31467 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1788,7 +1788,15 @@ class RowIterator(HTTPIterator): query (Optional[str]): The query text used. total_bytes_processed (Optional[int]): - total bytes processed from job statistics, if present. + If representing query results, the total bytes processed by the associated query. + slot_millis (Optional[int]): + If representing query results, the number of slot ms billed for the associated query. + created (Optional[datetime.datetime]): + If representing query results, the creation time of the associated query. + started (Optional[datetime.datetime]): + If representing query results, the start time of the associated query. + ended (Optional[datetime.datetime]): + If representing query results, the end time of the associated query. 
""" def __init__( @@ -1813,6 +1821,9 @@ def __init__( query: Optional[str] = None, total_bytes_processed: Optional[int] = None, slot_millis: Optional[int] = None, + created: Optional[datetime.datetime] = None, + started: Optional[datetime.datetime] = None, + ended: Optional[datetime.datetime] = None, ): super(RowIterator, self).__init__( client, @@ -1826,7 +1837,7 @@ def __init__( page_start=_rows_page_start, next_token="pageToken", ) - schema = _to_schema_fields(schema) + schema = _to_schema_fields(schema) if schema else () self._field_to_index = _helpers._field_to_index_mapping(schema) self._page_size = page_size self._preserve_order = False @@ -1843,6 +1854,9 @@ def __init__( self._query = query self._total_bytes_processed = total_bytes_processed self._slot_millis = slot_millis + self._job_created = created + self._job_started = started + self._job_ended = ended @property def _billing_project(self) -> Optional[str]: @@ -1905,6 +1919,21 @@ def slot_millis(self) -> Optional[int]: """Number of slot ms the user is actually billed for.""" return self._slot_millis + @property + def created(self) -> Optional[datetime.datetime]: + """If representing query results, the creation time of the associated query.""" + return self._job_created + + @property + def started(self) -> Optional[datetime.datetime]: + """If representing query results, the start time of the associated query.""" + return self._job_started + + @property + def ended(self) -> Optional[datetime.datetime]: + """If representing query results, the end time of the associated query.""" + return self._job_ended + def _is_almost_completely_cached(self): """Check if all results are completely cached. @@ -2888,7 +2917,6 @@ class _EmptyRowIterator(RowIterator): statements. """ - schema = () pages = () total_rows = 0 diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py index d565bc46e..a8f4c8e14 100644 --- a/google/cloud/bigquery/version.py +++ b/google/cloud/bigquery/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "3.35.1" +__version__ = "3.36.0" diff --git a/samples/desktopapp/requirements.txt b/samples/desktopapp/requirements.txt index 54b708ca8..e7a02eca5 100644 --- a/samples/desktopapp/requirements.txt +++ b/samples/desktopapp/requirements.txt @@ -1,2 +1,2 @@ -google-cloud-bigquery==3.35.0 +google-cloud-bigquery==3.35.1 google-auth-oauthlib==1.2.2 diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index 5b85a9bfe..fa54cc229 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -1,5 +1,5 @@ attrs==25.3.0 -certifi==2025.7.14 +certifi==2025.8.3 cffi==1.17.1 charset-normalizer==3.4.2 click===8.1.8; python_version == '3.9' @@ -13,20 +13,20 @@ geopandas===1.0.1; python_version <= '3.9' geopandas==1.1.1; python_version >= '3.10' google-api-core==2.25.1 google-auth==2.40.3 -google-cloud-bigquery==3.35.0 +google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-core==2.4.3 google-crc32c==1.7.1 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 -grpcio==1.73.1 +grpcio==1.74.0 idna==3.10 munch==4.0.0 mypy-extensions==1.1.0 packaging==25.0 pandas==2.3.1 proto-plus==1.26.1 -pyarrow==20.0.0 +pyarrow==21.0.0 pyasn1==0.6.1 pyasn1-modules==0.4.2 pycparser==2.22 diff --git a/samples/magics/requirements.txt b/samples/magics/requirements.txt index 5c48d707f..e7230053c 100644 --- a/samples/magics/requirements.txt +++ b/samples/magics/requirements.txt @@ -1,6 +1,6 @@ -bigquery_magics==0.10.1 +bigquery_magics==0.10.2 db-dtypes==1.4.3 -google.cloud.bigquery==3.35.0 +google.cloud.bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 ipython===8.18.1 pandas==2.3.1 diff --git a/samples/notebooks/requirements.txt b/samples/notebooks/requirements.txt index 88f725bb4..829f08f47 100644 --- a/samples/notebooks/requirements.txt +++ b/samples/notebooks/requirements.txt @@ -1,9 +1,9 @@ -bigquery-magics==0.10.1 +bigquery-magics==0.10.2 db-dtypes==1.4.3 -google-cloud-bigquery==3.35.0 +google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 ipython===8.18.1; python_version == '3.9' ipython==9.4.0; python_version >= '3.10' matplotlib===3.9.2; python_version == '3.9' -matplotlib==3.10.3; python_version >= '3.10' +matplotlib==3.10.5; python_version >= '3.10' pandas==2.3.1 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index e43cb04e9..afa62b6b8 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,2 +1,2 @@ # samples/snippets should be runnable with no "extras" -google-cloud-bigquery==3.35.0 +google-cloud-bigquery==3.35.1 diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index ebe2d2a7a..5070a199b 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -13,6 +13,7 @@ # limitations under the License. from unittest import mock +import threading import pytest @@ -24,6 +25,18 @@ def client(): yield make_client() +time_lock = threading.Lock() + + +@pytest.fixture +def global_time_lock(): + """Fixture to run tests serially that depend on the global time state, + such as tests of retry behavior. 
+ """ + with time_lock: + yield + + @pytest.fixture def PROJECT(): yield "PROJECT" diff --git a/tests/unit/job/test_async_job_retry.py b/tests/unit/job/test_async_job_retry.py new file mode 100644 index 000000000..35041aa1b --- /dev/null +++ b/tests/unit/job/test_async_job_retry.py @@ -0,0 +1,139 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import google.api_core.retry +from google.api_core import exceptions + +from . import helpers +import google.cloud.bigquery.job + + +PROJECT = "test-project" +JOB_ID = "test-job-id" + + +def test_cancel_w_custom_retry(global_time_lock): + from google.cloud.bigquery.retry import DEFAULT_RETRY + + api_path = "/projects/{}/jobs/{}/cancel".format(PROJECT, JOB_ID) + resource = { + "jobReference": { + "jobId": JOB_ID, + "projectId": PROJECT, + "location": None, + }, + "configuration": {"test": True}, + } + expected = resource.copy() + expected["statistics"] = {} + response = {"job": resource} + conn = helpers.make_connection( + ValueError, + response, + ) + client = helpers._make_client(project=PROJECT, connection=conn) + job = google.cloud.bigquery.job._AsyncJob( + google.cloud.bigquery.job._JobReference(JOB_ID, PROJECT, "EU"), client + ) + + retry = DEFAULT_RETRY.with_deadline(1).with_predicate( + lambda exc: isinstance(exc, ValueError) + ) + + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + result = job.cancel(retry=retry, timeout=7.5) + + final_attributes.assert_called() + + assert result is True + assert job._properties == expected + conn.api_request.assert_has_calls( + [ + mock.call( + method="POST", + path=api_path, + query_params={"location": "EU"}, + timeout=7.5, + ), + mock.call( + method="POST", + path=api_path, + query_params={"location": "EU"}, + timeout=7.5, + ), # was retried once + ], + ) + + +def test_result_w_retry_wo_state(global_time_lock): + from google.cloud.bigquery.retry import DEFAULT_GET_JOB_TIMEOUT + + begun_job_resource = helpers._make_job_resource( + job_id=JOB_ID, project_id=PROJECT, location="EU", started=True + ) + done_job_resource = helpers._make_job_resource( + job_id=JOB_ID, + project_id=PROJECT, + location="EU", + started=True, + ended=True, + ) + conn = helpers.make_connection( + exceptions.NotFound("not normally retriable"), + begun_job_resource, + exceptions.NotFound("not normally retriable"), + done_job_resource, + ) + client = helpers._make_client(project=PROJECT, connection=conn) + job = google.cloud.bigquery.job._AsyncJob( + google.cloud.bigquery.job._JobReference(JOB_ID, PROJECT, "EU"), client + ) + custom_predicate = mock.Mock() + custom_predicate.return_value = True + custom_retry = google.api_core.retry.Retry( + predicate=custom_predicate, + initial=0.001, + maximum=0.001, + deadline=0.1, + ) + assert job.result(retry=custom_retry) is job + + begin_call = mock.call( + method="POST", + path=f"/projects/{PROJECT}/jobs", + data={ + "jobReference": { + "jobId": JOB_ID, + "projectId": 
PROJECT, + "location": "EU", + } + }, + timeout=None, + ) + reload_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{JOB_ID}", + query_params={ + "projection": "full", + "location": "EU", + }, + timeout=DEFAULT_GET_JOB_TIMEOUT, + ) + conn.api_request.assert_has_calls( + [begin_call, begin_call, reload_call, reload_call] + ) diff --git a/tests/unit/job/test_base.py b/tests/unit/job/test_base.py index aa3d49ce3..f5861f645 100644 --- a/tests/unit/job/test_base.py +++ b/tests/unit/job/test_base.py @@ -17,8 +17,6 @@ import unittest from unittest import mock -from google.api_core import exceptions -import google.api_core.retry from google.api_core.future import polling import pytest @@ -882,50 +880,6 @@ def test_cancel_explicit(self): ) self.assertEqual(job._properties, expected) - def test_cancel_w_custom_retry(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - api_path = "/projects/{}/jobs/{}/cancel".format(self.PROJECT, self.JOB_ID) - resource = { - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": None, - }, - "configuration": {"test": True}, - } - expected = resource.copy() - expected["statistics"] = {} - response = {"job": resource} - job = self._set_properties_job() - - api_request_patcher = mock.patch.object( - job._client._connection, "api_request", side_effect=[ValueError, response] - ) - retry = DEFAULT_RETRY.with_deadline(1).with_predicate( - lambda exc: isinstance(exc, ValueError) - ) - - with api_request_patcher as fake_api_request: - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - result = job.cancel(retry=retry, timeout=7.5) - - final_attributes.assert_called() - - self.assertTrue(result) - self.assertEqual(job._properties, expected) - self.assertEqual( - fake_api_request.call_args_list, - [ - mock.call(method="POST", path=api_path, query_params={}, timeout=7.5), - mock.call( - method="POST", path=api_path, query_params={}, timeout=7.5 - ), # was retried once - ], - ) - def test__set_future_result_wo_done(self): client = _make_client(project=self.PROJECT) job = self._make_one(self.JOB_ID, client) @@ -1069,64 +1023,6 @@ def test_result_default_wo_state(self): ) conn.api_request.assert_has_calls([begin_call, begin_call, reload_call]) - def test_result_w_retry_wo_state(self): - from google.cloud.bigquery.retry import DEFAULT_GET_JOB_TIMEOUT - - begun_job_resource = _make_job_resource( - job_id=self.JOB_ID, project_id=self.PROJECT, location="EU", started=True - ) - done_job_resource = _make_job_resource( - job_id=self.JOB_ID, - project_id=self.PROJECT, - location="EU", - started=True, - ended=True, - ) - conn = make_connection( - exceptions.NotFound("not normally retriable"), - begun_job_resource, - exceptions.NotFound("not normally retriable"), - done_job_resource, - ) - client = _make_client(project=self.PROJECT, connection=conn) - job = self._make_one( - self._job_reference(self.JOB_ID, self.PROJECT, "EU"), client - ) - custom_predicate = mock.Mock() - custom_predicate.return_value = True - custom_retry = google.api_core.retry.Retry( - predicate=custom_predicate, - initial=0.001, - maximum=0.001, - deadline=0.1, - ) - self.assertIs(job.result(retry=custom_retry), job) - - begin_call = mock.call( - method="POST", - path=f"/projects/{self.PROJECT}/jobs", - data={ - "jobReference": { - "jobId": self.JOB_ID, - "projectId": self.PROJECT, - "location": "EU", - } - }, - timeout=None, - ) - reload_call = mock.call( - method="GET", - 
path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", - query_params={ - "projection": "full", - "location": "EU", - }, - timeout=DEFAULT_GET_JOB_TIMEOUT, - ) - conn.api_request.assert_has_calls( - [begin_call, begin_call, reload_call, reload_call] - ) - def test_result_explicit_w_state(self): conn = make_connection() client = _make_client(project=self.PROJECT, connection=conn) diff --git a/tests/unit/job/test_query.py b/tests/unit/job/test_query.py index 7201adb55..ef6429598 100644 --- a/tests/unit/job/test_query.py +++ b/tests/unit/job/test_query.py @@ -20,15 +20,11 @@ import types from unittest import mock -import freezegun -from google.api_core import exceptions -import google.api_core.retry import requests from google.cloud.bigquery.client import _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS import google.cloud.bigquery._job_helpers import google.cloud.bigquery.query -import google.cloud.bigquery.retry from google.cloud.bigquery.retry import DEFAULT_GET_JOB_TIMEOUT from google.cloud.bigquery.table import _EmptyRowIterator @@ -889,6 +885,9 @@ def test_result_reloads_job_state_until_done(self): job_resource_done = self._make_resource(started=True, ended=True, location="EU") job_resource_done["statistics"]["query"]["totalBytesProcessed"] = str(1234) job_resource_done["statistics"]["query"]["totalSlotMs"] = str(5678) + job_resource_done["statistics"]["creationTime"] = str(11) + job_resource_done["statistics"]["startTime"] = str(22) + job_resource_done["statistics"]["endTime"] = str(33) job_resource_done["configuration"]["query"]["destinationTable"] = { "projectId": "dest-project", "datasetId": "dest_dataset", @@ -971,6 +970,9 @@ def test_result_reloads_job_state_until_done(self): self.assertEqual(result.query, job.query) self.assertEqual(result.total_bytes_processed, 1234) self.assertEqual(result.slot_millis, 5678) + self.assertEqual(result.created.timestamp() * 1000, 11) + self.assertEqual(result.started.timestamp() * 1000, 22) + self.assertEqual(result.ended.timestamp() * 1000, 33) query_results_path = f"/projects/{self.PROJECT}/queries/{self.JOB_ID}" query_results_call = mock.call( @@ -1329,102 +1331,6 @@ def test_result_with_max_results(self): [jobs_get_call, query_page_waiting_call, query_page_2_call] ) - def test_result_w_custom_retry(self): - from google.cloud.bigquery.table import RowIterator - - query_resource = { - "jobComplete": False, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - } - query_resource_done = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - "totalRows": "2", - } - job_resource = self._make_resource(started=True, location="asia-northeast1") - job_resource_done = self._make_resource( - started=True, ended=True, location="asia-northeast1" - ) - job_resource_done["configuration"]["query"]["destinationTable"] = { - "projectId": "dest-project", - "datasetId": "dest_dataset", - "tableId": "dest_table", - } - - connection = make_connection( - # Also, for each API request, raise an exception that we know can - # be retried. Because of this, for each iteration we do: - # jobs.get (x2) & jobs.getQueryResults (x2) - exceptions.NotFound("not normally retriable"), - job_resource, - exceptions.NotFound("not normally retriable"), - query_resource, - # Query still not done, repeat both. 
- exceptions.NotFound("not normally retriable"), - job_resource, - exceptions.NotFound("not normally retriable"), - query_resource, - exceptions.NotFound("not normally retriable"), - # Query still not done, repeat both. - job_resource_done, - exceptions.NotFound("not normally retriable"), - query_resource_done, - # Query finished! - ) - client = _make_client(self.PROJECT, connection=connection) - job = self._get_target_class().from_api_repr(job_resource, client) - - custom_predicate = mock.Mock() - custom_predicate.return_value = True - custom_retry = google.api_core.retry.Retry( - initial=0.001, - maximum=0.001, - multiplier=1.0, - deadline=0.1, - predicate=custom_predicate, - ) - - self.assertIsInstance(job.result(retry=custom_retry), RowIterator) - query_results_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/queries/{self.JOB_ID}", - query_params={"maxResults": 0, "location": "asia-northeast1"}, - # TODO(tswast): Why do we end up setting timeout to - # google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT in - # some cases but not others? - timeout=mock.ANY, - ) - reload_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", - query_params={"projection": "full", "location": "asia-northeast1"}, - timeout=DEFAULT_GET_JOB_TIMEOUT, - ) - - connection.api_request.assert_has_calls( - [ - # See make_connection() call above for explanation of the - # expected API calls. - # - # Query not done. - reload_call, - reload_call, - query_results_call, - query_results_call, - # Query still not done. - reload_call, - reload_call, - query_results_call, - query_results_call, - # Query done! - reload_call, - reload_call, - query_results_call, - query_results_call, - ] - ) - def test_result_w_empty_schema(self): from google.cloud.bigquery.table import _EmptyRowIterator @@ -1449,102 +1355,6 @@ def test_result_w_empty_schema(self): self.assertEqual(result.location, "asia-northeast1") self.assertEqual(result.query_id, "xyz-abc") - def test_result_w_timeout_doesnt_raise(self): - import google.cloud.bigquery.client - - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - } - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = make_connection(begun_resource, query_resource, done_resource) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - job._properties["jobReference"]["location"] = "US" - job._properties["status"] = {"state": "RUNNING"} - - with freezegun.freeze_time("1970-01-01 00:00:00", tick=False): - job.result( - # Test that fractional seconds are supported, but use a timeout - # that is representable as a floating point without rounding - # errors since it can be represented exactly in base 2. In this - # case 1.125 is 9 / 8, which is a fraction with a power of 2 in - # the denominator. 
- timeout=1.125, - ) - - reload_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", - query_params={"projection": "full", "location": "US"}, - timeout=1.125, - ) - get_query_results_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/queries/{self.JOB_ID}", - query_params={ - "maxResults": 0, - "location": "US", - }, - timeout=google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT, - ) - connection.api_request.assert_has_calls( - [ - reload_call, - get_query_results_call, - reload_call, - ] - ) - - def test_result_w_timeout_raises_concurrent_futures_timeout(self): - import google.cloud.bigquery.client - - begun_resource = self._make_resource() - begun_resource["jobReference"]["location"] = "US" - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, - } - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = make_connection(begun_resource, query_resource, done_resource) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - job._properties["jobReference"]["location"] = "US" - job._properties["status"] = {"state": "RUNNING"} - - with freezegun.freeze_time( - "1970-01-01 00:00:00", auto_tick_seconds=1.0 - ), self.assertRaises(concurrent.futures.TimeoutError): - job.result(timeout=1.125) - - reload_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{self.JOB_ID}", - query_params={"projection": "full", "location": "US"}, - timeout=1.125, - ) - get_query_results_call = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/queries/{self.JOB_ID}", - query_params={ - "maxResults": 0, - "location": "US", - }, - timeout=google.cloud.bigquery.client._MIN_GET_QUERY_RESULTS_TIMEOUT, - ) - connection.api_request.assert_has_calls( - [ - reload_call, - get_query_results_call, - # Timeout before we can reload with the final job state. - ] - ) - def test_result_w_page_size(self): # Arrange query_results_resource = { diff --git a/tests/unit/job/test_query_job_retry.py b/tests/unit/job/test_query_job_retry.py new file mode 100644 index 000000000..c8355b688 --- /dev/null +++ b/tests/unit/job/test_query_job_retry.py @@ -0,0 +1,229 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from unittest import mock + +import concurrent.futures +import freezegun +from google.api_core import exceptions +import google.api_core.retry +import pytest + +from google.cloud.bigquery.client import _MIN_GET_QUERY_RESULTS_TIMEOUT +from google.cloud.bigquery.job import QueryJob +from google.cloud.bigquery.retry import DEFAULT_GET_JOB_TIMEOUT +from google.cloud.bigquery.table import RowIterator + +from ..helpers import make_connection +from .helpers import _make_client + + +PROJECT = "test-project" +JOB_ID = "test-job-id" +QUERY = "select count(*) from persons" + + +def _make_resource(started=False, ended=False, location="US"): + resource = { + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID, "location": location}, + "status": {"state": "PENDING"}, + "configuration": { + "query": {"query": QUERY}, + "job_type": "query", + }, + "statistics": {"creationTime": "1"}, + } + + if started: + resource["status"]["state"] = "RUNNING" + resource["statistics"]["startTime"] = "2" + + if ended: + resource["status"]["state"] = "DONE" + resource["statistics"]["endTime"] = "3" + + return resource + + +def test_result_w_custom_retry(global_time_lock): + query_resource = { + "jobComplete": False, + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID}, + } + query_resource_done = { + "jobComplete": True, + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + "totalRows": "2", + } + job_resource = _make_resource(started=True, location="asia-northeast1") + job_resource_done = _make_resource( + started=True, ended=True, location="asia-northeast1" + ) + job_resource_done["configuration"]["query"]["destinationTable"] = { + "projectId": "dest-project", + "datasetId": "dest_dataset", + "tableId": "dest_table", + } + + connection = make_connection( + # Also, for each API request, raise an exception that we know can + # be retried. Because of this, for each iteration we do: + # jobs.get (x2) & jobs.getQueryResults (x2) + exceptions.NotFound("not normally retriable"), + job_resource, + exceptions.NotFound("not normally retriable"), + query_resource, + # Query still not done, repeat both. + exceptions.NotFound("not normally retriable"), + job_resource, + exceptions.NotFound("not normally retriable"), + query_resource, + exceptions.NotFound("not normally retriable"), + # Query still not done, repeat both. + job_resource_done, + exceptions.NotFound("not normally retriable"), + query_resource_done, + # Query finished! 
+ ) + client = _make_client(PROJECT, connection=connection) + job = QueryJob.from_api_repr(job_resource, client) + + custom_predicate = mock.Mock() + custom_predicate.return_value = True + custom_retry = google.api_core.retry.Retry( + initial=0.001, + maximum=0.001, + multiplier=1.0, + deadline=0.1, + predicate=custom_predicate, + ) + + assert isinstance(job.result(retry=custom_retry), RowIterator) + query_results_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/queries/{JOB_ID}", + query_params={"maxResults": 0, "location": "asia-northeast1"}, + timeout=mock.ANY, + ) + reload_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{JOB_ID}", + query_params={"projection": "full", "location": "asia-northeast1"}, + timeout=DEFAULT_GET_JOB_TIMEOUT, + ) + + connection.api_request.assert_has_calls( + [ + reload_call, + reload_call, + query_results_call, + query_results_call, + reload_call, + reload_call, + query_results_call, + query_results_call, + reload_call, + reload_call, + query_results_call, + query_results_call, + ] + ) + + +def test_result_w_timeout_doesnt_raise(global_time_lock): + begun_resource = _make_resource() + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + } + done_resource = begun_resource.copy() + done_resource["status"] = {"state": "DONE"} + connection = make_connection(begun_resource, query_resource, done_resource) + client = _make_client(project=PROJECT, connection=connection) + job = QueryJob(JOB_ID, QUERY, client) + job._properties["jobReference"]["location"] = "US" + job._properties["status"] = {"state": "RUNNING"} + + with freezegun.freeze_time("1970-01-01 00:00:00", tick=False): + job.result( + timeout=1.125, + ) + + reload_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{JOB_ID}", + query_params={"projection": "full", "location": "US"}, + timeout=1.125, + ) + get_query_results_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/queries/{JOB_ID}", + query_params={ + "maxResults": 0, + "location": "US", + }, + timeout=_MIN_GET_QUERY_RESULTS_TIMEOUT, + ) + connection.api_request.assert_has_calls( + [ + reload_call, + get_query_results_call, + reload_call, + ] + ) + + +def test_result_w_timeout_raises_concurrent_futures_timeout(global_time_lock): + begun_resource = _make_resource() + begun_resource["jobReference"]["location"] = "US" + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": PROJECT, "jobId": JOB_ID}, + "schema": {"fields": [{"name": "col1", "type": "STRING"}]}, + } + done_resource = begun_resource.copy() + done_resource["status"] = {"state": "DONE"} + connection = make_connection(begun_resource, query_resource, done_resource) + client = _make_client(project=PROJECT, connection=connection) + job = QueryJob(JOB_ID, QUERY, client) + job._properties["jobReference"]["location"] = "US" + job._properties["status"] = {"state": "RUNNING"} + + with freezegun.freeze_time( + "1970-01-01 00:00:00", auto_tick_seconds=1.0 + ), pytest.raises(concurrent.futures.TimeoutError): + job.result(timeout=1.125) + + reload_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{JOB_ID}", + query_params={"projection": "full", "location": "US"}, + timeout=1.125, + ) + get_query_results_call = mock.call( + method="GET", + path=f"/projects/{PROJECT}/queries/{JOB_ID}", + query_params={ + "maxResults": 0, + "location": "US", + }, + timeout=_MIN_GET_QUERY_RESULTS_TIMEOUT, + ) + 
connection.api_request.assert_has_calls( + [ + reload_call, + get_query_results_call, + ] + ) diff --git a/tests/unit/test__job_helpers.py b/tests/unit/test__job_helpers.py index 417f911b8..1f543f033 100644 --- a/tests/unit/test__job_helpers.py +++ b/tests/unit/test__job_helpers.py @@ -15,7 +15,6 @@ from typing import Any, Dict, Optional from unittest import mock -import freezegun import google.api_core.exceptions from google.api_core import retry as retries import pytest @@ -450,110 +449,6 @@ def test_query_and_wait_uses_jobs_insert(): ) -def test_query_and_wait_retries_job(): - freezegun.freeze_time(auto_tick_seconds=100) - client = mock.create_autospec(Client) - client._call_api.__name__ = "_call_api" - client._call_api.__qualname__ = "Client._call_api" - client._call_api.__annotations__ = {} - client._call_api.__type_params__ = () - client._call_api.side_effect = ( - google.api_core.exceptions.BadGateway("retry me"), - google.api_core.exceptions.InternalServerError("job_retry me"), - google.api_core.exceptions.BadGateway("retry me"), - { - "jobReference": { - "projectId": "response-project", - "jobId": "abc", - "location": "response-location", - }, - "jobComplete": True, - "schema": { - "fields": [ - {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "age", "type": "INT64", "mode": "NULLABLE"}, - ], - }, - "rows": [ - {"f": [{"v": "Whillma Phlyntstone"}, {"v": "27"}]}, - {"f": [{"v": "Bhetty Rhubble"}, {"v": "28"}]}, - {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, - {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, - ], - }, - ) - rows = _job_helpers.query_and_wait( - client, - query="SELECT 1", - location="request-location", - project="request-project", - job_config=None, - page_size=None, - max_results=None, - retry=retries.Retry( - lambda exc: isinstance(exc, google.api_core.exceptions.BadGateway), - multiplier=1.0, - ).with_deadline( - 200.0 - ), # Since auto_tick_seconds is 100, we should get at least 1 retry. - job_retry=retries.Retry( - lambda exc: isinstance(exc, google.api_core.exceptions.InternalServerError), - multiplier=1.0, - ).with_deadline(600.0), - ) - assert len(list(rows)) == 4 - - # For this code path, where the query has finished immediately, we should - # only be calling the jobs.query API and no other request path. - request_path = "/projects/request-project/queries" - for call in client._call_api.call_args_list: - _, kwargs = call - assert kwargs["method"] == "POST" - assert kwargs["path"] == request_path - - -@freezegun.freeze_time(auto_tick_seconds=100) -def test_query_and_wait_retries_job_times_out(): - client = mock.create_autospec(Client) - client._call_api.__name__ = "_call_api" - client._call_api.__qualname__ = "Client._call_api" - client._call_api.__annotations__ = {} - client._call_api.__type_params__ = () - client._call_api.side_effect = ( - google.api_core.exceptions.BadGateway("retry me"), - google.api_core.exceptions.InternalServerError("job_retry me"), - google.api_core.exceptions.BadGateway("retry me"), - google.api_core.exceptions.InternalServerError("job_retry me"), - ) - - with pytest.raises(google.api_core.exceptions.RetryError) as exc_info: - _job_helpers.query_and_wait( - client, - query="SELECT 1", - location="request-location", - project="request-project", - job_config=None, - page_size=None, - max_results=None, - retry=retries.Retry( - lambda exc: isinstance(exc, google.api_core.exceptions.BadGateway), - multiplier=1.0, - ).with_deadline( - 200.0 - ), # Since auto_tick_seconds is 100, we should get at least 1 retry. 
- job_retry=retries.Retry( - lambda exc: isinstance( - exc, google.api_core.exceptions.InternalServerError - ), - multiplier=1.0, - ).with_deadline(400.0), - ) - - assert isinstance( - exc_info.value.cause, google.api_core.exceptions.InternalServerError - ) - - def test_query_and_wait_sets_job_creation_mode(): client = mock.create_autospec(Client) client.default_job_creation_mode = "JOB_CREATION_OPTIONAL" diff --git a/tests/unit/test__job_helpers_retry.py b/tests/unit/test__job_helpers_retry.py new file mode 100644 index 000000000..3ea4b1aae --- /dev/null +++ b/tests/unit/test__job_helpers_retry.py @@ -0,0 +1,122 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import freezegun +import google.api_core.exceptions +from google.api_core import retry as retries +import pytest + +from google.cloud.bigquery import _job_helpers + +from . import helpers + + +def test_query_and_wait_retries_job(global_time_lock): + with freezegun.freeze_time(auto_tick_seconds=100): + conn = helpers.make_connection( + google.api_core.exceptions.BadGateway("retry me"), + google.api_core.exceptions.InternalServerError("job_retry me"), + google.api_core.exceptions.BadGateway("retry me"), + { + "jobReference": { + "projectId": "response-project", + "jobId": "abc", + "location": "response-location", + }, + "jobComplete": True, + "schema": { + "fields": [ + {"name": "full_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "age", "type": "INT64", "mode": "NULLABLE"}, + ], + }, + "rows": [ + {"f": [{"v": "Whillma Phlyntstone"}, {"v": "27"}]}, + {"f": [{"v": "Bhetty Rhubble"}, {"v": "28"}]}, + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + ], + }, + ) + client = helpers.make_client(project="client-project") + client._connection = conn + rows = _job_helpers.query_and_wait( + client, + query="SELECT 1", + location="request-location", + project="request-project", + job_config=None, + page_size=None, + max_results=None, + retry=retries.Retry( + lambda exc: isinstance(exc, google.api_core.exceptions.BadGateway), + multiplier=1.0, + ).with_deadline( + 200.0 + ), # Since auto_tick_seconds is 100, we should get at least 1 retry. + job_retry=retries.Retry( + lambda exc: isinstance( + exc, google.api_core.exceptions.InternalServerError + ), + multiplier=1.0, + ).with_deadline(600.0), + ) + assert len(list(rows)) == 4 + + # For this code path, where the query has finished immediately, we should + # only be calling the jobs.query API and no other request path. 
+ request_path = "/projects/request-project/queries" + for call in client._connection.api_request.call_args_list: + _, kwargs = call + assert kwargs["method"] == "POST" + assert kwargs["path"] == request_path + + +def test_query_and_wait_retries_job_times_out(global_time_lock): + with freezegun.freeze_time(auto_tick_seconds=100): + conn = helpers.make_connection( + google.api_core.exceptions.BadGateway("retry me"), + google.api_core.exceptions.InternalServerError("job_retry me"), + google.api_core.exceptions.BadGateway("retry me"), + google.api_core.exceptions.InternalServerError("job_retry me"), + ) + client = helpers.make_client(project="client-project") + client._connection = conn + + with pytest.raises(google.api_core.exceptions.RetryError) as exc_info: + _job_helpers.query_and_wait( + client, + query="SELECT 1", + location="request-location", + project="request-project", + job_config=None, + page_size=None, + max_results=None, + retry=retries.Retry( + lambda exc: isinstance(exc, google.api_core.exceptions.BadGateway), + multiplier=1.0, + ).with_deadline( + 200.0 + ), # Since auto_tick_seconds is 100, we should get at least 1 retry. + job_retry=retries.Retry( + lambda exc: isinstance( + exc, google.api_core.exceptions.InternalServerError + ), + multiplier=1.0, + ).with_deadline(400.0), + ) + + assert isinstance( + exc_info.value.cause, google.api_core.exceptions.InternalServerError + ) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index bb86ccc3c..213f382dc 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -16,7 +16,6 @@ import collections import datetime import decimal -import email import gzip import http.client import io @@ -28,13 +27,10 @@ from unittest import mock import warnings -import freezegun import packaging import pytest import requests -import google.api - try: import opentelemetry @@ -58,8 +54,6 @@ import google.cloud._helpers from google.cloud import bigquery -from google.cloud.bigquery import job as bqjob -import google.cloud.bigquery._job_helpers from google.cloud.bigquery.dataset import DatasetReference, Dataset from google.cloud.bigquery.enums import UpdateMode, DatasetView from google.cloud.bigquery import exceptions @@ -313,31 +307,6 @@ def test__call_api_extra_headers(self): headers = kwargs["headers"] assert headers["x-goog-request-reason"] == "because-friday" - def test__call_api_applying_custom_retry_on_timeout(self): - from concurrent.futures import TimeoutError - from google.cloud.bigquery.retry import DEFAULT_RETRY - - creds = _make_credentials() - client = self._make_one(project=self.PROJECT, credentials=creds) - - api_request_patcher = mock.patch.object( - client._connection, - "api_request", - side_effect=[TimeoutError, "result"], - ) - retry = DEFAULT_RETRY.with_deadline(1).with_predicate( - lambda exc: isinstance(exc, TimeoutError) - ) - - with api_request_patcher as fake_api_request: - result = client._call_api(retry, foo="bar") - - self.assertEqual(result, "result") - self.assertEqual( - fake_api_request.call_args_list, - [mock.call(foo="bar"), mock.call(foo="bar")], # was retried once - ) - def test__call_api_span_creator_not_called(self): from concurrent.futures import TimeoutError from google.cloud.bigquery.retry import DEFAULT_RETRY @@ -644,48 +613,6 @@ def test_get_service_account_email_w_alternate_project(self): ) self.assertEqual(service_account_email, email) - def test_get_service_account_email_w_custom_retry(self): - from google.cloud.bigquery.retry import DEFAULT_RETRY - - api_path = 
"/projects/{}/serviceAccount".format(self.PROJECT) - creds = _make_credentials() - http = object() - client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) - - resource = { - "kind": "bigquery#getServiceAccountResponse", - "email": "bq-123@bigquery-encryption.iam.gserviceaccount.com", - } - api_request_patcher = mock.patch.object( - client._connection, - "api_request", - side_effect=[ValueError, resource], - ) - - retry = DEFAULT_RETRY.with_deadline(1).with_predicate( - lambda exc: isinstance(exc, ValueError) - ) - - with api_request_patcher as fake_api_request: - with mock.patch( - "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" - ) as final_attributes: - service_account_email = client.get_service_account_email( - retry=retry, timeout=7.5 - ) - - final_attributes.assert_called_once_with({"path": api_path}, client, None) - self.assertEqual( - service_account_email, "bq-123@bigquery-encryption.iam.gserviceaccount.com" - ) - self.assertEqual( - fake_api_request.call_args_list, - [ - mock.call(method="GET", path=api_path, timeout=7.5), - mock.call(method="GET", path=api_path, timeout=7.5), # was retried once - ], - ) - def test_dataset_with_specified_project(self): from google.cloud.bigquery.dataset import DatasetReference @@ -3848,176 +3775,6 @@ def test_load_table_from_uri_w_default_load_config(self): timeout=DEFAULT_TIMEOUT, ) - @staticmethod - def _mock_requests_response(status_code, headers, content=b""): - return mock.Mock( - content=content, - headers=headers, - status_code=status_code, - spec=["content", "headers", "status_code"], - ) - - def _mock_transport(self, status_code, headers, content=b""): - fake_transport = mock.Mock(spec=["request"]) - fake_response = self._mock_requests_response( - status_code, headers, content=content - ) - fake_transport.request.return_value = fake_response - return fake_transport - - def _initiate_resumable_upload_helper(self, num_retries=None, mtls=False): - from google.resumable_media.requests import ResumableUpload - from google.cloud.bigquery.client import _DEFAULT_CHUNKSIZE - from google.cloud.bigquery.client import _GENERIC_CONTENT_TYPE - from google.cloud.bigquery.client import _get_upload_headers - from google.cloud.bigquery.job import LoadJob - from google.cloud.bigquery.job import LoadJobConfig - from google.cloud.bigquery.job import SourceFormat - - # Create mocks to be checked for doing transport. - resumable_url = "http://test.invalid?upload_id=hey-you" - response_headers = {"location": resumable_url} - fake_transport = self._mock_transport(http.client.OK, response_headers) - client = self._make_one(project=self.PROJECT, _http=fake_transport) - conn = client._connection = make_connection() - if mtls: - conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") - - # Create some mock arguments and call the method under test. - data = b"goodbye gudbi gootbee" - stream = io.BytesIO(data) - config = LoadJobConfig() - config.source_format = SourceFormat.CSV - job = LoadJob(None, None, self.TABLE_REF, client, job_config=config) - metadata = job.to_api_repr() - upload, transport = client._initiate_resumable_upload( - stream, metadata, num_retries, None - ) - - # Check the returned values. 
- self.assertIsInstance(upload, ResumableUpload) - - host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" - upload_url = ( - f"{host_name}/upload/bigquery/v2/projects/{self.PROJECT}" - "/jobs?uploadType=resumable" - ) - self.assertEqual(upload.upload_url, upload_url) - expected_headers = _get_upload_headers(conn.user_agent) - self.assertEqual(upload._headers, expected_headers) - self.assertFalse(upload.finished) - self.assertEqual(upload._chunk_size, _DEFAULT_CHUNKSIZE) - self.assertIs(upload._stream, stream) - self.assertIsNone(upload._total_bytes) - self.assertEqual(upload._content_type, _GENERIC_CONTENT_TYPE) - self.assertEqual(upload.resumable_url, resumable_url) - - retry_strategy = upload._retry_strategy - self.assertEqual(retry_strategy.max_sleep, 64.0) - if num_retries is None: - self.assertEqual(retry_strategy.max_cumulative_retry, 600.0) - self.assertIsNone(retry_strategy.max_retries) - else: - self.assertIsNone(retry_strategy.max_cumulative_retry) - self.assertEqual(retry_strategy.max_retries, num_retries) - self.assertIs(transport, fake_transport) - # Make sure we never read from the stream. - self.assertEqual(stream.tell(), 0) - - # Check the mocks. - request_headers = expected_headers.copy() - request_headers["x-upload-content-type"] = _GENERIC_CONTENT_TYPE - fake_transport.request.assert_called_once_with( - "POST", - upload_url, - data=json.dumps(metadata).encode("utf-8"), - headers=request_headers, - timeout=mock.ANY, - ) - - def test__initiate_resumable_upload(self): - self._initiate_resumable_upload_helper() - - def test__initiate_resumable_upload_mtls(self): - self._initiate_resumable_upload_helper(mtls=True) - - def test__initiate_resumable_upload_with_retry(self): - self._initiate_resumable_upload_helper(num_retries=11) - - def _do_multipart_upload_success_helper( - self, get_boundary, num_retries=None, project=None, mtls=False - ): - from google.cloud.bigquery.client import _get_upload_headers - from google.cloud.bigquery.job import LoadJob - from google.cloud.bigquery.job import LoadJobConfig - from google.cloud.bigquery.job import SourceFormat - - fake_transport = self._mock_transport(http.client.OK, {}) - client = self._make_one(project=self.PROJECT, _http=fake_transport) - conn = client._connection = make_connection() - if mtls: - conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") - - if project is None: - project = self.PROJECT - - # Create some mock arguments. - data = b"Bzzzz-zap \x00\x01\xf4" - stream = io.BytesIO(data) - config = LoadJobConfig() - config.source_format = SourceFormat.CSV - job = LoadJob(None, None, self.TABLE_REF, client, job_config=config) - metadata = job.to_api_repr() - size = len(data) - - response = client._do_multipart_upload( - stream, metadata, size, num_retries, None, project=project - ) - - # Check the mocks and the returned value. 
- self.assertIs(response, fake_transport.request.return_value) - self.assertEqual(stream.tell(), size) - get_boundary.assert_called_once_with() - - host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" - upload_url = ( - f"{host_name}/upload/bigquery/v2/projects/{project}" - "/jobs?uploadType=multipart" - ) - payload = ( - b"--==0==\r\n" - b"content-type: application/json; charset=UTF-8\r\n\r\n" - b"%(json_metadata)s" - b"\r\n" - b"--==0==\r\n" - b"content-type: */*\r\n\r\n" - b"%(data)s" - b"\r\n" - b"--==0==--" - ) % {b"json_metadata": json.dumps(metadata).encode("utf-8"), b"data": data} - - headers = _get_upload_headers(conn.user_agent) - headers["content-type"] = b'multipart/related; boundary="==0=="' - fake_transport.request.assert_called_once_with( - "POST", upload_url, data=payload, headers=headers, timeout=mock.ANY - ) - - @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") - def test__do_multipart_upload(self, get_boundary): - self._do_multipart_upload_success_helper(get_boundary) - - @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") - def test__do_multipart_upload_mtls(self, get_boundary): - self._do_multipart_upload_success_helper(get_boundary, mtls=True) - - @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") - def test__do_multipart_upload_with_retry(self, get_boundary): - self._do_multipart_upload_success_helper(get_boundary, num_retries=8) - - @mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") - def test__do_multipart_upload_with_custom_project(self, get_boundary): - self._do_multipart_upload_success_helper(get_boundary, project="custom-project") - def test_copy_table(self): from google.cloud.bigquery.job import CopyJob @@ -5543,143 +5300,6 @@ def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_fails_no_retries(self job_retry=None, ) - def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_retries_404(self): - """Regression test for https://github.com/googleapis/python-bigquery/issues/2134 - - Sometimes after a Conflict, the fetch fails with a 404, but we know - because of the conflict that really the job does exist. Retry until we - get the job status (or timeout). - """ - job_id = "abc123" - creds = _make_credentials() - http = object() - client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) - conn = client._connection = make_connection( - # We're mocking QueryJob._begin, so this is only going to be - # jobs.get requests and responses. - google.api_core.exceptions.TooManyRequests("this is retriable by default"), - google.api_core.exceptions.NotFound("we lost your job"), - google.api_core.exceptions.NotFound("we lost your job again, sorry"), - { - "jobReference": { - "projectId": self.PROJECT, - "location": "TESTLOC", - "jobId": job_id, - } - }, - ) - - job_create_error = google.api_core.exceptions.Conflict("Job already exists.") - job_begin_patcher = mock.patch.object( - bqjob.QueryJob, "_begin", side_effect=job_create_error - ) - job_id_patcher = mock.patch.object( - google.cloud.bigquery._job_helpers, - "make_job_id", - return_value=job_id, - ) - - with job_begin_patcher, job_id_patcher: - # If get job request fails there does exist a job - # with this ID already, retry 404 until we get it (or fails for a - # non-retriable reason, see other tests). 
- result = client.query("SELECT 1;", job_id=None) - - jobs_get_path = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{job_id}", - query_params={ - "projection": "full", - }, - timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT, - ) - conn.api_request.assert_has_calls( - # Double-check that it was jobs.get that was called for each of our - # mocked responses. - [jobs_get_path] - * 4, - ) - assert result.job_id == job_id - - def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_retries_404_and_query_job_insert( - self, - ): - """Regression test for https://github.com/googleapis/python-bigquery/issues/2134 - - Sometimes after a Conflict, the fetch fails with a 404. If it keeps - failing with a 404, assume that the job actually doesn't exist. - """ - job_id_1 = "abc123" - job_id_2 = "xyz789" - creds = _make_credentials() - http = object() - client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) - - # We're mocking QueryJob._begin, so that the connection should only get - # jobs.get requests. - job_create_error = google.api_core.exceptions.Conflict("Job already exists.") - job_begin_patcher = mock.patch.object( - bqjob.QueryJob, "_begin", side_effect=job_create_error - ) - conn = client._connection = make_connection( - google.api_core.exceptions.NotFound("we lost your job again, sorry"), - { - "jobReference": { - "projectId": self.PROJECT, - "location": "TESTLOC", - "jobId": job_id_2, - } - }, - ) - - # Choose a small deadline so the 404 retries give up. - retry = ( - google.cloud.bigquery.retry._DEFAULT_GET_JOB_CONFLICT_RETRY.with_deadline(1) - ) - job_id_patcher = mock.patch.object( - google.cloud.bigquery._job_helpers, - "make_job_id", - side_effect=[job_id_1, job_id_2], - ) - retry_patcher = mock.patch.object( - google.cloud.bigquery.retry, - "_DEFAULT_GET_JOB_CONFLICT_RETRY", - retry, - ) - - with freezegun.freeze_time( - "2025-01-01 00:00:00", - # 10x the retry deadline to guarantee a timeout. - auto_tick_seconds=10, - ), job_begin_patcher, job_id_patcher, retry_patcher: - # If get job request fails there does exist a job - # with this ID already, retry 404 until we get it (or fails for a - # non-retriable reason, see other tests). - result = client.query("SELECT 1;", job_id=None) - - jobs_get_path_1 = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{job_id_1}", - query_params={ - "projection": "full", - }, - timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT, - ) - jobs_get_path_2 = mock.call( - method="GET", - path=f"/projects/{self.PROJECT}/jobs/{job_id_2}", - query_params={ - "projection": "full", - }, - timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT, - ) - conn.api_request.assert_has_calls( - # Double-check that it was jobs.get that was called for each of our - # mocked responses. 
- [jobs_get_path_1, jobs_get_path_2], - ) - assert result.job_id == job_id_2 - def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_succeeds(self): from google.api_core.exceptions import Conflict from google.cloud.bigquery.job import QueryJob @@ -5719,6 +5339,9 @@ def test_query_and_wait_defaults(self): "queryId": "job_abcDEF_", "totalBytesProcessed": 1234, "totalSlotMs": 5678, + "creationTime": "1437767599006", + "startTime": "1437767600007", + "endTime": "1437767601008", } creds = _make_credentials() http = object() @@ -5737,6 +5360,9 @@ def test_query_and_wait_defaults(self): self.assertEqual(rows.query, query) self.assertEqual(rows.total_bytes_processed, 1234) self.assertEqual(rows.slot_millis, 5678) + self.assertEqual(rows.created.timestamp() * 1000, 1437767599006) + self.assertEqual(rows.started.timestamp() * 1000, 1437767600007) + self.assertEqual(rows.ended.timestamp() * 1000, 1437767601008) # Verify the request we send is to jobs.query. conn.api_request.assert_called_once() @@ -10026,213 +9652,6 @@ def test_load_table_from_json_unicode_emoji_data_case(self): assert sent_data_file.getvalue() == expected_bytes # Low-level tests - - @classmethod - def _make_resumable_upload_responses(cls, size): - """Make a series of responses for a successful resumable upload.""" - from google import resumable_media - - resumable_url = "http://test.invalid?upload_id=and-then-there-was-1" - initial_response = cls._make_response( - http.client.OK, "", {"location": resumable_url} - ) - data_response = cls._make_response( - resumable_media.PERMANENT_REDIRECT, - "", - {"range": "bytes=0-{:d}".format(size - 1)}, - ) - final_response = cls._make_response( - http.client.OK, - json.dumps({"size": size}), - {"Content-Type": "application/json"}, - ) - return [initial_response, data_response, final_response] - - @staticmethod - def _make_transport(responses=None): - import google.auth.transport.requests - - transport = mock.create_autospec( - google.auth.transport.requests.AuthorizedSession, instance=True - ) - transport.request.side_effect = responses - return transport - - def test__do_resumable_upload(self): - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - transport = self._make_transport( - self._make_resumable_upload_responses(file_obj_len) - ) - client = self._make_client(transport) - - result = client._do_resumable_upload( - file_obj, self.EXPECTED_CONFIGURATION, None, None - ) - - content = result.content.decode("utf-8") - assert json.loads(content) == {"size": file_obj_len} - - # Verify that configuration data was passed in with the initial - # request. - transport.request.assert_any_call( - "POST", - mock.ANY, - data=json.dumps(self.EXPECTED_CONFIGURATION).encode("utf-8"), - headers=mock.ANY, - timeout=mock.ANY, - ) - - def test__do_resumable_upload_custom_project(self): - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - transport = self._make_transport( - self._make_resumable_upload_responses(file_obj_len) - ) - client = self._make_client(transport) - - result = client._do_resumable_upload( - file_obj, - self.EXPECTED_CONFIGURATION, - None, - None, - project="custom-project", - ) - - content = result.content.decode("utf-8") - assert json.loads(content) == {"size": file_obj_len} - - # Verify that configuration data was passed in with the initial - # request. 
- transport.request.assert_any_call( - "POST", - mock.ANY, - data=json.dumps(self.EXPECTED_CONFIGURATION).encode("utf-8"), - headers=mock.ANY, - timeout=mock.ANY, - ) - - initiation_url = next( - ( - call[0][1] - for call in transport.request.call_args_list - if call[0][0] == "POST" and "uploadType=resumable" in call[0][1] - ), - None, - ) # pragma: NO COVER - - assert initiation_url is not None - assert "projects/custom-project" in initiation_url - - def test__do_resumable_upload_custom_timeout(self): - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - transport = self._make_transport( - self._make_resumable_upload_responses(file_obj_len) - ) - client = self._make_client(transport) - - client._do_resumable_upload( - file_obj, self.EXPECTED_CONFIGURATION, num_retries=0, timeout=3.14 - ) - - # The timeout should be applied to all underlying calls. - for call_args in transport.request.call_args_list: - assert call_args[1].get("timeout") == 3.14 - - def test__do_multipart_upload(self): - transport = self._make_transport([self._make_response(http.client.OK)]) - client = self._make_client(transport) - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - - client._do_multipart_upload( - file_obj, self.EXPECTED_CONFIGURATION, file_obj_len, None, None - ) - - # Verify that configuration data was passed in with the initial - # request. - request_args = transport.request.mock_calls[0][2] - request_data = request_args["data"].decode("utf-8") - request_headers = request_args["headers"] - - request_content = email.message_from_string( - "Content-Type: {}\r\n{}".format( - request_headers["content-type"].decode("utf-8"), request_data - ) - ) - - # There should be two payloads: the configuration and the binary daya. 
- configuration_data = request_content.get_payload(0).get_payload() - binary_data = request_content.get_payload(1).get_payload() - - assert json.loads(configuration_data) == self.EXPECTED_CONFIGURATION - assert binary_data.encode("utf-8") == file_obj.getvalue() - - def test__do_multipart_upload_wrong_size(self): - client = self._make_client() - file_obj = self._make_file_obj() - file_obj_len = len(file_obj.getvalue()) - - with pytest.raises(ValueError): - client._do_multipart_upload(file_obj, {}, file_obj_len + 1, None, None) - - def test_schema_from_json_with_file_path(self): - from google.cloud.bigquery.schema import SchemaField - - file_content = """[ - { - "description": "quarter", - "mode": "REQUIRED", - "name": "qtr", - "type": "STRING" - }, - { - "description": "sales representative", - "mode": "NULLABLE", - "name": "rep", - "type": "STRING" - }, - { - "description": "total sales", - "mode": "NULLABLE", - "name": "sales", - "type": "FLOAT" - } - ]""" - - expected = [ - SchemaField("qtr", "STRING", "REQUIRED", description="quarter"), - SchemaField( - "rep", - "STRING", - "NULLABLE", - description="sales representative", - ), - SchemaField( - "sales", - "FLOAT", - "NULLABLE", - description="total sales", - ), - ] - - client = self._make_client() - mock_file_path = "/mocked/file.json" - - open_patch = mock.patch( - "builtins.open", new=mock.mock_open(read_data=file_content) - ) - - with open_patch as _mock_file: - actual = client.schema_from_json(mock_file_path) - _mock_file.assert_called_once_with(mock_file_path) - # This assert is to make sure __exit__ is called in the context - # manager that opens the file in the function - _mock_file().__exit__.assert_called_once() - - assert expected == actual - def test_schema_from_json_with_file_object(self): from google.cloud.bigquery.schema import SchemaField diff --git a/tests/unit/test_client_bigframes.py b/tests/unit/test_client_bigframes.py new file mode 100644 index 000000000..0260da5e4 --- /dev/null +++ b/tests/unit/test_client_bigframes.py @@ -0,0 +1,411 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Client features enabling the bigframes integration.""" + +from __future__ import annotations + +import datetime +from unittest import mock + +import pytest + +import google.auth.credentials +from google.api_core import exceptions +from google.cloud import bigquery +import google.cloud.bigquery.client +from google.cloud.bigquery import _job_helpers + + +PROJECT = "test-project" +LOCATION = "test-location" + + +def make_response(body, *, status_code: int = 200): + response = mock.Mock() + type(response).status_code = mock.PropertyMock(return_value=status_code) + response.json.return_value = body + return response + + +@pytest.fixture +def client(): + """A real client object with mocked API requests.""" + credentials = mock.create_autospec( + google.auth.credentials.Credentials, instance=True + ) + http_session = mock.Mock() + return google.cloud.bigquery.client.Client( + project=PROJECT, + credentials=credentials, + _http=http_session, + location=LOCATION, + ) + + +def test_query_and_wait_bigframes_dry_run_no_callback(client): + client._http.request.side_effect = [ + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query + "location": LOCATION, + "queryId": "abcdefg", + "totalBytesProcessed": "123", + "jobComplete": True, + } + ), + ] + callback = mock.Mock() + job_config = bigquery.QueryJobConfig(dry_run=True) + response = client._query_and_wait_bigframes( + query="SELECT 1", job_config=job_config, callback=callback + ) + callback.assert_not_called() + assert response.total_bytes_processed == 123 + assert response.query_id == "abcdefg" + + +def test_query_and_wait_bigframes_callback(client): + created = datetime.datetime( + 2025, 8, 18, 10, 11, 12, 345000, tzinfo=datetime.timezone.utc + ) + started = datetime.datetime( + 2025, 8, 18, 10, 11, 13, 456000, tzinfo=datetime.timezone.utc + ) + ended = datetime.datetime( + 2025, 8, 18, 10, 11, 14, 567000, tzinfo=datetime.timezone.utc + ) + client._http.request.side_effect = [ + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query + "location": LOCATION, + "queryId": "abcdefg", + "totalRows": "100", + "totalBytesProcessed": "123", + "totalSlotMs": "987", + "jobComplete": True, + "creationTime": _to_millis(created), + "startTime": _to_millis(started), + "endTime": _to_millis(ended), + } + ), + ] + callback = mock.Mock() + client._query_and_wait_bigframes(query="SELECT 1", callback=callback) + callback.assert_has_calls( + [ + mock.call( + _job_helpers.QuerySentEvent( + query="SELECT 1", + billing_project=PROJECT, + location=LOCATION, + # No job ID, because a basic query is eligible for jobs.query. + job_id=None, + request_id=mock.ANY, + ) + ), + mock.call( + _job_helpers.QueryFinishedEvent( + billing_project=PROJECT, + location=LOCATION, + query_id="abcdefg", + total_rows=100, + total_bytes_processed=123, + slot_millis=987, + created=created, + started=started, + ended=ended, + # No job ID or destination, because a basic query is eligible for jobs.query. 
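+                    # (jobs.query responses identify the query by a
+                    # server-assigned queryId, asserted as "abcdefg" above.)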
+ job_id=None, + destination=None, + ), + ), + ] + ) + + +def _to_millis(dt: datetime.datetime) -> str: + return str( + int( + (dt - datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)) + / datetime.timedelta(milliseconds=1) + ) + ) + + +def test_query_and_wait_bigframes_with_jobs_insert_callback_empty_results(client): + client._http.request.side_effect = [ + # jobs.insert because destination table present in job_config + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert + # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "statistics": { + "creationTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ) + ), + "query": { + "statementType": "SELECT", + # "queryPlan": [{"name": "part1"}, {"name": "part2"}], + }, + }, + "status": { + "state": "PENDING", + }, + } + ), + # jobs.get waiting for query to finish + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/insert + # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "status": { + "state": "PENDING", + }, + } + ), + # jobs.getQueryResults with max_results=0 + make_response( + { + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "jobComplete": True, + # totalRows is intentionally missing so we end up in the _EmptyRowIterator code path. + } + ), + # jobs.get + make_response( + { + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "statistics": { + "creationTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ) + ), + "startTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 32, 123000, tzinfo=datetime.timezone.utc + ) + ), + "endTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 33, 123000, tzinfo=datetime.timezone.utc + ) + ), + "query": { + "statementType": "SELECT", + "totalBytesProcessed": 123, + "totalSlotMs": 987, + }, + }, + "status": {"state": "DONE"}, + } + ), + ] + callback = mock.Mock() + config = bigquery.QueryJobConfig() + config.destination = "proj.dset.table" + client._query_and_wait_bigframes( + query="SELECT 1", job_config=config, callback=callback + ) + callback.assert_has_calls( + [ + mock.call( + _job_helpers.QuerySentEvent( + query="SELECT 1", + billing_project="response-project", + location="response-location", + job_id="response-job-id", + # We use jobs.insert not jobs.query because destination is + # present on job_config. 
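+                    # request_id is None as well: that field is only
+                    # populated for queries sent via the jobs.query code path.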
+ request_id=None, + ) + ), + mock.call( + _job_helpers.QueryReceivedEvent( + billing_project="response-project", + location="response-location", + job_id="response-job-id", + statement_type="SELECT", + state="PENDING", + query_plan=[], + created=datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ), + started=None, + ended=None, + ) + ), + mock.call( + _job_helpers.QueryFinishedEvent( + billing_project="response-project", + location="response-location", + job_id="response-job-id", + query_id=None, + total_rows=0, + total_bytes_processed=123, + slot_millis=987, + created=datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ), + started=datetime.datetime( + 2025, 8, 13, 13, 7, 32, 123000, tzinfo=datetime.timezone.utc + ), + ended=datetime.datetime( + 2025, 8, 13, 13, 7, 33, 123000, tzinfo=datetime.timezone.utc + ), + destination=None, + ), + ), + ] + ) + + +def test_query_and_wait_bigframes_with_jobs_insert_dry_run_no_callback(client): + client._http.request.side_effect = [ + # jobs.insert because destination table present in job_config + make_response( + { + "jobReference": { + "projectId": "response-project", + "jobId": "response-job-id", + "location": "response-location", + }, + "statistics": { + "creationTime": _to_millis( + datetime.datetime( + 2025, 8, 13, 13, 7, 31, 123000, tzinfo=datetime.timezone.utc + ) + ), + "query": { + "statementType": "SELECT", + "totalBytesProcessed": 123, + "schema": { + "fields": [ + {"name": "_f0", "type": "INTEGER"}, + ], + }, + }, + }, + "configuration": { + "dryRun": True, + }, + "status": {"state": "DONE"}, + } + ), + ] + callback = mock.Mock() + config = bigquery.QueryJobConfig() + config.destination = "proj.dset.table" + config.dry_run = True + result = client._query_and_wait_bigframes( + query="SELECT 1", job_config=config, callback=callback + ) + callback.assert_not_called() + assert result.total_bytes_processed == 123 + assert result.schema == [bigquery.SchemaField("_f0", "INTEGER")] + + +def test_query_and_wait_bigframes_with_query_retry_callbacks(client, global_time_lock): + created = datetime.datetime( + 2025, 8, 18, 10, 11, 12, 345000, tzinfo=datetime.timezone.utc + ) + started = datetime.datetime( + 2025, 8, 18, 10, 11, 13, 456000, tzinfo=datetime.timezone.utc + ) + ended = datetime.datetime( + 2025, 8, 18, 10, 11, 14, 567000, tzinfo=datetime.timezone.utc + ) + client._http.request.side_effect = [ + exceptions.InternalServerError( + "first try", errors=({"reason": "jobInternalError"},) + ), + make_response( + { + # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query + "location": LOCATION, + "queryId": "abcdefg", + "totalRows": "100", + "totalBytesProcessed": "123", + "totalSlotMs": "987", + "jobComplete": True, + "creationTime": _to_millis(created), + "startTime": _to_millis(started), + "endTime": _to_millis(ended), + } + ), + ] + callback = mock.Mock() + client._query_and_wait_bigframes(query="SELECT 1", callback=callback) + callback.assert_has_calls( + [ + mock.call( + _job_helpers.QuerySentEvent( + query="SELECT 1", + billing_project=PROJECT, + location=LOCATION, + # No job ID, because a basic query is eligible for jobs.query. + job_id=None, + request_id=mock.ANY, + ) + ), + mock.call( + _job_helpers.QueryRetryEvent( + query="SELECT 1", + billing_project=PROJECT, + location=LOCATION, + # No job ID, because a basic query is eligible for jobs.query. 
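+                    # This retry event fires because the InternalServerError
+                    # above carries reason "jobInternalError", which the
+                    # default job retry treats as retriable.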
+ job_id=None, + request_id=mock.ANY, + ) + ), + mock.call( + _job_helpers.QueryFinishedEvent( + billing_project=PROJECT, + location=LOCATION, + query_id=mock.ANY, + total_rows=100, + total_bytes_processed=123, + slot_millis=987, + created=created, + started=started, + ended=ended, + # No job ID or destination, because a basic query is eligible for jobs.query. + job_id=None, + destination=None, + ), + ), + ] + ) diff --git a/tests/unit/test_client_resumable_media_upload.py b/tests/unit/test_client_resumable_media_upload.py new file mode 100644 index 000000000..642c18d15 --- /dev/null +++ b/tests/unit/test_client_resumable_media_upload.py @@ -0,0 +1,433 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock +import email +import http.client +import io +import json + +import pytest + +from google.cloud.bigquery.table import TableReference + +from .helpers import make_connection + + +PROJECT = "test-project" +TABLE_REF = TableReference.from_string(f"{PROJECT}.test_dataset.test_table") +EXPECTED_CONFIGURATION = { + "load": { + "destinationTable": { + "projectId": PROJECT, + "datasetId": "test_dataset", + "tableId": "test_table", + }, + "sourceFormat": "CSV", + } +} + + +@pytest.fixture(autouse=True) +def mock_sleep(monkeypatch): + sleep = mock.Mock() + monkeypatch.setattr("time.sleep", sleep) + + +def _make_credentials(): + import google.auth.credentials + + return mock.Mock(spec=google.auth.credentials.Credentials) + + +def _make_client(*args, **kw): + from google.cloud.bigquery.client import Client + + kw["credentials"] = _make_credentials() + kw["project"] = PROJECT + return Client(*args, **kw) + + +def _make_file_obj(contents=b"some data"): + return io.BytesIO(contents) + + +def _make_response(status_code, content=b"", headers=None): + response = mock.Mock(spec=["status_code", "content", "request", "headers"]) + response.status_code = status_code + response.content = content + response.headers = headers or {} + response.request = mock.Mock(spec=["headers"]) + return response + + +def _make_resumable_upload_responses(num_bytes): + # In a real scenario, the upload URL is returned in a 'Location' + # header. 
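+    # The first response below stands in for the session initiation; the
+    # second stands in for the final upload request, which reports the
+    # uploaded object's size as JSON.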
+ return [ + _make_response( + http.client.OK, + headers={"location": "http://test.invalid/upload-id"}, + ), + _make_response( + http.client.OK, content=json.dumps({"size": num_bytes}).encode("utf-8") + ), + ] + + +def _make_transport(responses=None): + import google.auth.transport.requests + + transport = mock.create_autospec( + google.auth.transport.requests.AuthorizedSession, instance=True + ) + transport.request.side_effect = responses + return transport + + +def _mock_requests_response(status_code, headers, content=b""): + return mock.Mock( + content=content, + headers=headers, + status_code=status_code, + spec=["content", "headers", "status_code"], + ) + + +def _mock_transport(status_code, headers, content=b""): + fake_transport = mock.Mock(spec=["request"]) + fake_response = _mock_requests_response(status_code, headers, content=content) + fake_transport.request.return_value = fake_response + return fake_transport + + +def _initiate_resumable_upload_helper(num_retries=None, mtls=False): + from google.resumable_media.requests import ResumableUpload + from google.cloud.bigquery.client import _DEFAULT_CHUNKSIZE + from google.cloud.bigquery.client import _GENERIC_CONTENT_TYPE + from google.cloud.bigquery.client import _get_upload_headers + from google.cloud.bigquery.job import LoadJob + from google.cloud.bigquery.job import LoadJobConfig + from google.cloud.bigquery.job import SourceFormat + + # Create mocks to be checked for doing transport. + resumable_url = "http://test.invalid?upload_id=hey-you" + response_headers = {"location": resumable_url} + fake_transport = _mock_transport(http.client.OK, response_headers) + client = _make_client(_http=fake_transport) + conn = client._connection = make_connection() + if mtls: + conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") + + # Create some mock arguments and call the method under test. + data = b"goodbye gudbi gootbee" + stream = io.BytesIO(data) + config = LoadJobConfig() + config.source_format = SourceFormat.CSV + job = LoadJob(None, None, TABLE_REF, client, job_config=config) + metadata = job.to_api_repr() + upload, transport_out = client._initiate_resumable_upload( + stream, metadata, num_retries, None + ) + + # Check the returned values. + assert isinstance(upload, ResumableUpload) + + host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" + upload_url = ( + f"{host_name}/upload/bigquery/v2/projects/{PROJECT}/jobs?uploadType=resumable" + ) + assert upload.upload_url == upload_url + expected_headers = _get_upload_headers(conn.user_agent) + assert upload._headers == expected_headers + assert not upload.finished + assert upload._chunk_size == _DEFAULT_CHUNKSIZE + assert upload._stream is stream + assert upload._total_bytes is None + assert upload._content_type == _GENERIC_CONTENT_TYPE + assert upload.resumable_url == resumable_url + + retry_strategy = upload._retry_strategy + assert retry_strategy.max_sleep == 64.0 + if num_retries is None: + assert retry_strategy.max_cumulative_retry == 600.0 + assert retry_strategy.max_retries is None + else: + assert retry_strategy.max_cumulative_retry is None + assert retry_strategy.max_retries == num_retries + assert transport_out is fake_transport + # Make sure we never read from the stream. + assert stream.tell() == 0 + + # Check the mocks. 
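+    # Beyond the standard upload headers, the initiation request is expected
+    # to advertise the payload's eventual content type via the
+    # x-upload-content-type header.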
+ request_headers = expected_headers.copy() + request_headers["x-upload-content-type"] = _GENERIC_CONTENT_TYPE + fake_transport.request.assert_called_once_with( + "POST", + upload_url, + data=json.dumps(metadata).encode("utf-8"), + headers=request_headers, + timeout=mock.ANY, + ) + + +def test__initiate_resumable_upload(): + _initiate_resumable_upload_helper() + + +def test__initiate_resumable_upload_mtls(): + _initiate_resumable_upload_helper(mtls=True) + + +def test_initiate_resumable_upload_with_retry(): + _initiate_resumable_upload_helper(num_retries=11) + + +def _do_multipart_upload_success_helper( + get_boundary, num_retries=None, project=None, mtls=False +): + from google.cloud.bigquery.client import _get_upload_headers + from google.cloud.bigquery.job import LoadJob + from google.cloud.bigquery.job import LoadJobConfig + from google.cloud.bigquery.job import SourceFormat + + fake_transport = _mock_transport(http.client.OK, {}) + client = _make_client(_http=fake_transport) + conn = client._connection = make_connection() + if mtls: + conn.get_api_base_url_for_mtls = mock.Mock(return_value="https://foo.mtls") + + if project is None: + project = PROJECT + + # Create some mock arguments. + data = b"Bzzzz-zap \x00\x01\xf4" + stream = io.BytesIO(data) + config = LoadJobConfig() + config.source_format = SourceFormat.CSV + job = LoadJob(None, None, TABLE_REF, client, job_config=config) + metadata = job.to_api_repr() + size = len(data) + + response = client._do_multipart_upload( + stream, metadata, size, num_retries, None, project=project + ) + + # Check the mocks and the returned value. + assert response is fake_transport.request.return_value + assert stream.tell() == size + get_boundary.assert_called_once_with() + + host_name = "https://foo.mtls" if mtls else "https://bigquery.googleapis.com" + upload_url = ( + f"{host_name}/upload/bigquery/v2/projects/{project}/jobs?uploadType=multipart" + ) + payload = ( + b"--==0==\r\n" + b"content-type: application/json; charset=UTF-8\r\n\r\n" + b"%(json_metadata)s" + b"\r\n" + b"--==0==\r\n" + b"content-type: */*\r\n\r\n" + b"%(data)s" + b"\r\n" + b"--==0==--" + ) % {b"json_metadata": json.dumps(metadata).encode("utf-8"), b"data": data} + + headers = _get_upload_headers(conn.user_agent) + headers["content-type"] = b'multipart/related; boundary="==0=="' + fake_transport.request.assert_called_once_with( + "POST", upload_url, data=payload, headers=headers, timeout=mock.ANY + ) + + +@mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") +def test__do_multipart_upload(get_boundary): + _do_multipart_upload_success_helper(get_boundary) + + +@mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") +def test__do_multipart_upload_mtls(get_boundary): + _do_multipart_upload_success_helper(get_boundary, mtls=True) + + +@mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") +def test_do_multipart_upload_with_retry(get_boundary): + _do_multipart_upload_success_helper(get_boundary, num_retries=8) + + +@mock.patch("google.resumable_media._upload.get_boundary", return_value=b"==0==") +def test__do_multipart_upload_with_custom_project(get_boundary): + _do_multipart_upload_success_helper(get_boundary, project="custom-project") + + +def test__do_resumable_upload(): + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + transport = _make_transport(_make_resumable_upload_responses(file_obj_len)) + client = _make_client(_http=transport) + + result = 
client._do_resumable_upload(file_obj, EXPECTED_CONFIGURATION, None, None) + + content = result.content.decode("utf-8") + assert json.loads(content) == {"size": file_obj_len} + + transport.request.assert_any_call( + "POST", + mock.ANY, + data=json.dumps(EXPECTED_CONFIGURATION).encode("utf-8"), + headers=mock.ANY, + timeout=mock.ANY, + ) + + +def test__do_resumable_upload_custom_project(): + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + transport = _make_transport(_make_resumable_upload_responses(file_obj_len)) + client = _make_client(_http=transport) + + result = client._do_resumable_upload( + file_obj, + EXPECTED_CONFIGURATION, + None, + None, + project="custom-project", + ) + + content = result.content.decode("utf-8") + assert json.loads(content) == {"size": file_obj_len} + + transport.request.assert_any_call( + "POST", + mock.ANY, + data=json.dumps(EXPECTED_CONFIGURATION).encode("utf-8"), + headers=mock.ANY, + timeout=mock.ANY, + ) + + initiation_url = next( + ( + call[0][1] + for call in transport.request.call_args_list + if call[0][0] == "POST" and "uploadType=resumable" in call[0][1] + ), + None, + ) + assert initiation_url is not None + assert "projects/custom-project" in initiation_url + + +def test__do_resumable_upload_custom_timeout(): + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + transport = _make_transport(_make_resumable_upload_responses(file_obj_len)) + client = _make_client(_http=transport) + + client._do_resumable_upload( + file_obj, EXPECTED_CONFIGURATION, num_retries=0, timeout=3.14 + ) + + for call_args in transport.request.call_args_list: + assert call_args[1].get("timeout") == 3.14 + + +def test__do_multipart_upload_request_body(): + transport = _make_transport([_make_response(http.client.OK)]) + client = _make_client(_http=transport) + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + + client._do_multipart_upload( + file_obj, EXPECTED_CONFIGURATION, file_obj_len, None, None + ) + + request_args = transport.request.mock_calls[0][2] + request_data = request_args["data"].decode("utf-8") + request_headers = request_args["headers"] + + request_content = email.message_from_string( + "Content-Type: {}\n{}".format( + request_headers["content-type"].decode("utf-8"), request_data + ) + ) + + configuration_data = request_content.get_payload(0).get_payload() + binary_data = request_content.get_payload(1).get_payload() + + assert json.loads(configuration_data) == EXPECTED_CONFIGURATION + assert binary_data.encode("utf-8") == file_obj.getvalue() + + +def test__do_multipart_upload_wrong_size(): + client = _make_client() + file_obj = _make_file_obj() + file_obj_len = len(file_obj.getvalue()) + + with pytest.raises(ValueError): + client._do_multipart_upload(file_obj, {}, file_obj_len + 1, None, None) + + +def test_schema_from_json_with_file_path(): + from google.cloud.bigquery.schema import SchemaField + + file_content = """ + [ + { + "description": "quarter", + "mode": "REQUIRED", + "name": "qtr", + "type": "STRING" + }, + { + "description": "sales representative", + "mode": "NULLABLE", + "name": "rep", + "type": "STRING" + }, + { + "description": "total sales", + "mode": "NULLABLE", + "name": "sales", + "type": "FLOAT" + } + ]""" + + expected = [ + SchemaField("qtr", "STRING", "REQUIRED", description="quarter"), + SchemaField( + "rep", + "STRING", + "NULLABLE", + description="sales representative", + ), + SchemaField( + "sales", + "FLOAT", + "NULLABLE", + description="total sales", + ), + ] + + client = 
_make_client() + mock_file_path = "/mocked/file.json" + + open_patch = mock.patch("builtins.open", new=mock.mock_open(read_data=file_content)) + + with open_patch as _mock_file: + actual = client.schema_from_json(mock_file_path) + _mock_file.assert_called_once_with(mock_file_path) + _mock_file.return_value.read.assert_called_once() + + assert expected == actual diff --git a/tests/unit/test_client_retry.py b/tests/unit/test_client_retry.py new file mode 100644 index 000000000..6e49cc464 --- /dev/null +++ b/tests/unit/test_client_retry.py @@ -0,0 +1,279 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import freezegun +import google.api_core.exceptions +from google.cloud.bigquery import job as bqjob +from google.cloud.bigquery.retry import DEFAULT_RETRY +from .helpers import make_connection + + +PROJECT = "test-project" + + +def _make_credentials(): + import google.auth.credentials + + return mock.Mock(spec=google.auth.credentials.Credentials) + + +def _make_client(*args, **kw): + from google.cloud.bigquery.client import Client + + return Client(*args, **kw) + + +def test_get_service_account_email_w_custom_retry(global_time_lock): + api_path = f"/projects/{PROJECT}/serviceAccount" + creds = _make_credentials() + http = object() + client = _make_client(project=PROJECT, credentials=creds, _http=http) + + resource = { + "kind": "bigquery#getServiceAccountResponse", + "email": "bq-123@bigquery-encryption.iam.gserviceaccount.com", + } + api_request_patcher = mock.patch.object( + client._connection, + "api_request", + side_effect=[ValueError, resource], + ) + + retry = DEFAULT_RETRY.with_deadline(1).with_predicate( + lambda exc: isinstance(exc, ValueError) + ) + + with api_request_patcher as fake_api_request: + with mock.patch( + "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" + ) as final_attributes: + service_account_email = client.get_service_account_email( + retry=retry, timeout=7.5 + ) + + final_attributes.assert_called_once_with({"path": api_path}, client, None) + assert service_account_email == "bq-123@bigquery-encryption.iam.gserviceaccount.com" + assert fake_api_request.call_args_list == [ + mock.call(method="GET", path=api_path, timeout=7.5), + mock.call(method="GET", path=api_path, timeout=7.5), # was retried once + ] + + +def test_call_api_applying_custom_retry_on_timeout(global_time_lock): + from concurrent.futures import TimeoutError + + creds = _make_credentials() + client = _make_client(project=PROJECT, credentials=creds) + + api_request_patcher = mock.patch.object( + client._connection, + "api_request", + side_effect=[TimeoutError, "result"], + ) + retry = DEFAULT_RETRY.with_deadline(1).with_predicate( + lambda exc: isinstance(exc, TimeoutError) + ) + + with api_request_patcher as fake_api_request: + result = client._call_api(retry, foo="bar") + + assert result == "result" + assert fake_api_request.call_args_list == [ + mock.call(foo="bar"), + mock.call(foo="bar"), + ] + + +def 
test_query_job_rpc_fail_w_conflict_random_id_job_fetch_retries_404(
+    global_time_lock,
+):
+    """Regression test for https://github.com/googleapis/python-bigquery/issues/2134
+
+    Sometimes after a Conflict, the fetch fails with a 404, but we know
+    because of the conflict that the job really does exist. Retry until we
+    get the job status (or timeout).
+    """
+    job_id = "abc123"
+    creds = _make_credentials()
+    http = object()
+    client = _make_client(project=PROJECT, credentials=creds, _http=http)
+    conn = client._connection = make_connection(
+        # We're mocking QueryJob._begin, so this is only going to be
+        # jobs.get requests and responses.
+        google.api_core.exceptions.TooManyRequests("this is retriable by default"),
+        google.api_core.exceptions.NotFound("we lost your job"),
+        google.api_core.exceptions.NotFound("we lost your job again, sorry"),
+        {
+            "jobReference": {
+                "projectId": PROJECT,
+                "location": "TESTLOC",
+                "jobId": job_id,
+            }
+        },
+    )
+
+    job_create_error = google.api_core.exceptions.Conflict("Job already exists.")
+    job_begin_patcher = mock.patch.object(
+        bqjob.QueryJob, "_begin", side_effect=job_create_error
+    )
+    job_id_patcher = mock.patch.object(
+        google.cloud.bigquery._job_helpers,
+        "make_job_id",
+        return_value=job_id,
+    )
+
+    with job_begin_patcher, job_id_patcher:
+        # If the get-job request fails, we know from the Conflict that a job
+        # with this ID already exists, so retry on 404 until we get it (or it
+        # fails for a non-retriable reason; see other tests).
+        result = client.query("SELECT 1;", job_id=None)
+
+    jobs_get_path = mock.call(
+        method="GET",
+        path=f"/projects/{PROJECT}/jobs/{job_id}",
+        query_params={
+            "projection": "full",
+        },
+        timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT,
+    )
+    conn.api_request.assert_has_calls(
+        # Double-check that it was jobs.get that was called for each of our
+        # mocked responses.
+        [jobs_get_path]
+        * 4,
+    )
+    assert result.job_id == job_id
+
+
+def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_retries_404_and_query_job_insert(
+    global_time_lock,
+):
+    """Regression test for https://github.com/googleapis/python-bigquery/issues/2134
+
+    Sometimes after a Conflict, the fetch fails with a 404. If it keeps
+    failing with a 404, assume that the job actually doesn't exist.
+    """
+    job_id_1 = "abc123"
+    job_id_2 = "xyz789"
+    creds = _make_credentials()
+    http = object()
+    client = _make_client(project=PROJECT, credentials=creds, _http=http)
+
+    # We're mocking QueryJob._begin, so that the connection should only get
+    # jobs.get requests.
+    job_create_error = google.api_core.exceptions.Conflict("Job already exists.")
+    job_begin_patcher = mock.patch.object(
+        bqjob.QueryJob, "_begin", side_effect=job_create_error
+    )
+    conn = client._connection = make_connection(
+        google.api_core.exceptions.NotFound("we lost your job again, sorry"),
+        {
+            "jobReference": {
+                "projectId": PROJECT,
+                "location": "TESTLOC",
+                "jobId": job_id_2,
+            }
+        },
+    )
+
+    # Choose a small deadline so the 404 retries give up.
+    retry = google.cloud.bigquery.retry._DEFAULT_GET_JOB_CONFLICT_RETRY.with_deadline(1)
+    job_id_patcher = mock.patch.object(
+        google.cloud.bigquery._job_helpers,
+        "make_job_id",
+        side_effect=[job_id_1, job_id_2],
+    )
+    retry_patcher = mock.patch.object(
+        google.cloud.bigquery.retry,
+        "_DEFAULT_GET_JOB_CONFLICT_RETRY",
+        retry,
+    )
+
+    with freezegun.freeze_time(
+        "2025-01-01 00:00:00",
+        # 10x the retry deadline to guarantee a timeout.
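+        # (freezegun advances the frozen clock by auto_tick_seconds every
+        # time it is read, so the 1-second retry deadline above is exceeded
+        # on the first tick.)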
+ auto_tick_seconds=10, + ), job_begin_patcher, job_id_patcher, retry_patcher: + # If get job request fails there does exist a job + # with this ID already, retry 404 until we get it (or fails for a + # non-retriable reason, see other tests). + result = client.query("SELECT 1;", job_id=None) + + jobs_get_path_1 = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{job_id_1}", + query_params={ + "projection": "full", + }, + timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT, + ) + jobs_get_path_2 = mock.call( + method="GET", + path=f"/projects/{PROJECT}/jobs/{job_id_2}", + query_params={ + "projection": "full", + }, + timeout=google.cloud.bigquery.retry.DEFAULT_GET_JOB_TIMEOUT, + ) + conn.api_request.assert_has_calls( + # Double-check that it was jobs.get that was called for each of our + # mocked responses. + [jobs_get_path_1, jobs_get_path_2], + ) + assert result.job_id == job_id_2 + + +def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_retry(global_time_lock): + """Regression test for https://github.com/googleapis/python-bigquery/issues/2134 + + If we get a 409 conflict on jobs.insert, and we are using a random + job ID, we should retry by getting the job by ID. This test ensures that + if the get job by ID fails, we retry the whole sequence. + """ + from google.cloud.bigquery import job + + client = _make_client(project=PROJECT, credentials=_make_credentials()) + job_id = "some-random-job-id" + query_text = "SELECT 1" + job_config = job.QueryJobConfig() + job_config.use_legacy_sql = False + + job_resource = { + "jobReference": {"projectId": PROJECT, "jobId": job_id}, + "configuration": {"query": {"query": query_text}}, + "status": {"state": "DONE"}, + } + + conn = make_connection( + # First attempt at jobs.insert fails with a 409 + google.api_core.exceptions.Conflict("Job already exists."), + # First attempt at jobs.get fails with a 500 + google.api_core.exceptions.InternalServerError("get job failed"), + # Second attempt at jobs.insert succeeds + job_resource, + ) + client._connection = conn + + job_id_patcher = mock.patch.object( + google.cloud.bigquery._job_helpers, + "make_job_id", + return_value=job_id, + ) + + with job_id_patcher: + query_job = client.query(query_text, job_config=job_config, job_id=None) + + assert query_job.job_id == job_id diff --git a/tests/unit/test_job_retry.py b/tests/unit/test_job_retry.py index 7144c640b..7343fed3d 100644 --- a/tests/unit/test_job_retry.py +++ b/tests/unit/test_job_retry.py @@ -80,7 +80,7 @@ ), ], ) -def test_retry_failed_jobs(sleep, reason, job_retry, result_retry): +def test_retry_failed_jobs(sleep, reason, job_retry, result_retry, global_time_lock): client = make_client() err = dict(reason=reason) conn = client._connection = make_connection( @@ -138,7 +138,7 @@ def test_retry_failed_jobs(sleep, reason, job_retry, result_retry): def test_retry_connection_error_with_default_retries_and_successful_first_job( - monkeypatch, client + monkeypatch, client, global_time_lock ): """ Make sure ConnectionError can be retried at `is_job_done` level, even if @@ -254,7 +254,7 @@ def make_job_id(*args, **kwargs): def test_query_retry_with_default_retry_and_ambiguous_errors_only_retries_with_failed_job( - client, monkeypatch + client, monkeypatch, global_time_lock ): """ Some errors like 'rateLimitExceeded' can be ambiguous. Make sure we only @@ -419,7 +419,7 @@ def make_job_id(*args, **kwargs): # - Pass None retry to `result`. 
@pytest.mark.parametrize("job_retry_on_query", ["Query", "Result"]) @mock.patch("time.sleep") -def test_disable_retry_failed_jobs(sleep, client, job_retry_on_query): +def test_disable_retry_failed_jobs(sleep, client, job_retry_on_query, global_time_lock): """ Test retry of job failures, as opposed to API-invocation failures. """ @@ -450,7 +450,7 @@ def api_request(method, path, query_params=None, data=None, **kw): @mock.patch("time.sleep") -def test_retry_failed_jobs_after_retry_failed(sleep, client): +def test_retry_failed_jobs_after_retry_failed(sleep, client, global_time_lock): """ If at first you don't succeed, maybe you will later. :) """ @@ -508,7 +508,7 @@ def api_request(method, path, query_params=None, data=None, **kw): assert job.job_id != orig_job_id -def test_raises_on_job_retry_on_query_with_non_retryable_jobs(client): +def test_raises_on_job_retry_on_query_with_non_retryable_jobs(client, global_time_lock): with pytest.raises( TypeError, match=( @@ -520,7 +520,9 @@ def test_raises_on_job_retry_on_query_with_non_retryable_jobs(client): client.query("select 42", job_id=42, job_retry=google.api_core.retry.Retry()) -def test_raises_on_job_retry_on_result_with_non_retryable_jobs(client): +def test_raises_on_job_retry_on_result_with_non_retryable_jobs( + client, global_time_lock +): client._connection = make_connection({}) with pytest.warns( @@ -542,7 +544,7 @@ def test_raises_on_job_retry_on_result_with_non_retryable_jobs(client): job.result(job_retry=google.api_core.retry.Retry()) -def test_query_and_wait_retries_job_for_DDL_queries(): +def test_query_and_wait_retries_job_for_DDL_queries(global_time_lock): """ Specific test for retrying DDL queries with "jobRateLimitExceeded" error: https://github.com/googleapis/python-bigquery/issues/1790 diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 2b704d3c9..adb43bcd9 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -2016,6 +2016,54 @@ def test_slot_millis_present_string(self): query = self._make_one(resource) self.assertEqual(query.slot_millis, 123456) + def test_created_missing(self): + query = self._make_one(self._make_resource()) + self.assertIsNone(query.created) + + def test_created_present_integer(self): + resource = self._make_resource() + resource["creationTime"] = 1437767599006 + query = self._make_one(resource) + self.assertEqual(query.created.timestamp() * 1000, 1437767599006) + + def test_created_present_string(self): + resource = self._make_resource() + resource["creationTime"] = "1437767599006" + query = self._make_one(resource) + self.assertEqual(query.created.timestamp() * 1000, 1437767599006) + + def test_started_missing(self): + query = self._make_one(self._make_resource()) + self.assertIsNone(query.started) + + def test_started_present_integer(self): + resource = self._make_resource() + resource["startTime"] = 1437767599006 + query = self._make_one(resource) + self.assertEqual(query.started.timestamp() * 1000, 1437767599006) + + def test_started_present_string(self): + resource = self._make_resource() + resource["startTime"] = "1437767599006" + query = self._make_one(resource) + self.assertEqual(query.started.timestamp() * 1000, 1437767599006) + + def test_ended_missing(self): + query = self._make_one(self._make_resource()) + self.assertIsNone(query.ended) + + def test_ended_present_integer(self): + resource = self._make_resource() + resource["endTime"] = 1437767599006 + query = self._make_one(resource) + self.assertEqual(query.ended.timestamp() * 1000, 1437767599006) + + def 
test_ended_present_string(self): + resource = self._make_resource() + resource["endTime"] = "1437767599006" + query = self._make_one(resource) + self.assertEqual(query.ended.timestamp() * 1000, 1437767599006) + def test_num_dml_affected_rows_missing(self): query = self._make_one(self._make_resource()) self.assertIsNone(query.num_dml_affected_rows)