From 6b599f91453fbac67d9285708b3b814c6eed3c58 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 21 May 2024 11:27:47 -0500
Subject: [PATCH 1/3] perf: decrease the threshold at which we use the BQ Storage Read API

---
 google/cloud/bigquery/table.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index ad1253195..f3e3de31c 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -108,7 +108,10 @@
 
 # How many of the total rows need to be downloaded already for us to skip
 # calling the BQ Storage API?
-ALMOST_COMPLETELY_CACHED_RATIO = 0.333
+# In microbenchmarks, the first full page will be about 10 MB. If a page is
+# about 2 MB, then it's the same speed or faster to download all the results
+# using the BigQuery Storage Read API.
+ALMOST_COMPLETELY_CACHED_RATIO = 10 / (10 + 2)
 
 
 def _reference_getter(table):

From 3e53c4cd415483c8e93f255dc2b0a2a93f185313 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 21 May 2024 12:00:10 -0500
Subject: [PATCH 2/3] fix unit test

---
 tests/unit/test_table.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py
index 099529f95..fcbba03aa 100644
--- a/tests/unit/test_table.py
+++ b/tests/unit/test_table.py
@@ -2307,9 +2307,17 @@ def test__is_almost_completely_cached_returns_true_with_some_rows_remaining(self
         rows = [
             {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
             {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
+            {"f": [{"v": "Whillma Phlyntstone"}, {"v": "27"}]},
+            {"f": [{"v": "Bhetty Rhubble"}, {"v": "28"}]},
+            {"f": [{"v": "Pebbles Phlyntstone"}, {"v": "4"}]},
+            {"f": [{"v": "Bamm-Bamm Rhubble"}, {"v": "5"}]},
+            {"f": [{"v": "Joseph Rockhead"}, {"v": "32"}]},
+            {"f": [{"v": "Perry Masonry"}, {"v": "33"}]},
         ]
         first_page = {"pageToken": "next-page", "rows": rows}
-        iterator = self._make_one(first_page_response=first_page, total_rows=6)
+        iterator = self._make_one(
+            first_page_response=first_page, total_rows=len(rows) + 1
+        )
         self.assertTrue(iterator._is_almost_completely_cached())
 
     def test__is_almost_completely_cached_returns_true_with_no_rows_remaining(self):

From 54466434a47099cdde54f1b6015bb6a5a66d58d7 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Tue, 21 May 2024 14:41:54 -0500
Subject: [PATCH 3/3] update comment

---
 google/cloud/bigquery/table.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index f3e3de31c..6ebb0709a 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -108,10 +108,17 @@
 
 # How many of the total rows need to be downloaded already for us to skip
 # calling the BQ Storage API?
-# In microbenchmarks, the first full page will be about 10 MB. If a page is
-# about 2 MB, then it's the same speed or faster to download all the results
-# using the BigQuery Storage Read API.
-ALMOST_COMPLETELY_CACHED_RATIO = 10 / (10 + 2)
+#
+# In microbenchmarks on 2024-05-21, I (tswast@) measured that at about 2 MB of
+# remaining results, it's faster to use the BQ Storage Read API to download
+# the results than to use jobs.getQueryResults. Since we don't have a good
+# way to know the remaining bytes, we estimate by the remaining number of rows.
+#
+# Unless rows themselves are unusually large, I observe that a single page of
+# results will be around 10 MB. Therefore, the proportion of rows already
+# downloaded should be 10 (first page) / 12 (all results) or more for it to
+# be worth skipping the Read API and finishing with jobs.getQueryResults.
+ALMOST_COMPLETELY_CACHED_RATIO = 0.833333
 
 
 def _reference_getter(table):
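
For readers following the series, here is a minimal, self-contained sketch of
the heuristic these patches tune. The standalone helper and its parameters are
illustrative stand-ins for the RowIterator internals exercised by the tests
above (its _is_almost_completely_cached method), not the library's actual
signatures, and the total_rows <= 0 behavior is an assumption of the sketch.

    # Matches patch 1/3: ~10 MB first page vs. ~2 MB break-even remainder.
    ALMOST_COMPLETELY_CACHED_RATIO = 10 / (10 + 2)  # ~0.833333

    def is_almost_completely_cached(cached_rows: int, total_rows: int) -> bool:
        """Return True when enough rows are already downloaded that finishing
        via jobs.getQueryResults beats opening a BQ Storage Read API session."""
        if total_rows <= 0:
            # Assumption for this sketch: with no known total, don't bother
            # opening a Read API session for whatever remains.
            return True
        return cached_rows / total_rows >= ALMOST_COMPLETELY_CACHED_RATIO

    # Mirrors the updated unit test in patch 2/3: 8 of 9 total rows cached.
    assert is_almost_completely_cached(8, 9)      # 8/9 ~= 0.889 -> skip Read API
    # With only a third of the rows cached, the new threshold now opts for the
    # Read API, where the old 0.333 threshold would have skipped it.
    assert not is_almost_completely_cached(3, 9)  # 3/9 ~= 0.333 < 0.8333

Estimating by row count rather than bytes keeps the check cheap; the rewritten
comment in patch 3/3 acknowledges that the estimate degrades when individual
rows are unusually large.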