perf: decrease the threshold in which we use the BQ Storage Read API (#…

…1925) * perf: decrease the threshold in which we use the BQ Storage Read API * fix unit test * update comment
googleapis · May 21, 2024 · eaa1a52 · eaa1a52
1 parent 0dac714
commit eaa1a52
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 2 deletions.
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
@@ -108,7 +108,17 @@
 
 # How many of the total rows need to be downloaded already for us to skip
 # calling the BQ Storage API?
-ALMOST_COMPLETELY_CACHED_RATIO = 0.333
+#
+# In microbenchmarks on 2024-05-21, I (tswast@) measured that at about 2 MB of
+# remaining results, it's faster to use the BQ Storage Read API to download
+# the results than to use jobs.getQueryResults. Since we don't have a good way to
+# know the remaining bytes, we estimate by remaining number of rows.
+#
+# Except when rows themselves are larger, I observe that a single page of
+# results will be around 10 MB. Therefore, the proportion of rows already
+# downloaded should be 10 (first page) / 12 (all results) or more for it to be
+# worth it to make a call to jobs.getQueryResults.
+ALMOST_COMPLETELY_CACHED_RATIO = 0.833333
 
 
 def _reference_getter(table):

diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py
@@ -2307,9 +2307,17 @@ def test__is_almost_completely_cached_returns_true_with_some_rows_remaining(self
         rows = [
             {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
             {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
+            {"f": [{"v": "Whillma Phlyntstone"}, {"v": "27"}]},
+            {"f": [{"v": "Bhetty Rhubble"}, {"v": "28"}]},
+            {"f": [{"v": "Pebbles Phlyntstone"}, {"v": "4"}]},
+            {"f": [{"v": "Bamm-Bamm Rhubble"}, {"v": "5"}]},
+            {"f": [{"v": "Joseph Rockhead"}, {"v": "32"}]},
+            {"f": [{"v": "Perry Masonry"}, {"v": "33"}]},
         ]
         first_page = {"pageToken": "next-page", "rows": rows}
-        iterator = self._make_one(first_page_response=first_page, total_rows=6)
+        iterator = self._make_one(
+            first_page_response=first_page, total_rows=len(rows) + 1
+        )
         self.assertTrue(iterator._is_almost_completely_cached())
 
     def test__is_almost_completely_cached_returns_true_with_no_rows_remaining(self):