diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1a70f961..94db8bcdc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,21 @@
 # Release History
 
+# 3.7.2 (2025-01-31)
+
+- Updated the `retry_delay_max` and `retry_timeout` values (databricks/databricks-sql-python#497 by @jprakash-db)
+
+# 3.7.1 (2025-01-07)
+
+- Relaxed the number of HTTP retry attempts (databricks/databricks-sql-python#486 by @jprakash-db)
+
+# 3.7.0 (2024-12-23)
+
+- Fix: Incorrect number of rows fetched in inline results when fetching results with FETCH_NEXT orientation (databricks/databricks-sql-python#479 by @jprakash-db)
+- Updated the docs to specify that native parameters are not supported in the PUT operation (databricks/databricks-sql-python#477 by @jprakash-db)
+- Relaxed the `pyarrow` and `numpy` pins (databricks/databricks-sql-python#452 by @arredond)
+- Feature: Support for async execute has been added (databricks/databricks-sql-python#463 by @jprakash-db)
+- Updated the HTTP retry logic to be similar to the other Databricks drivers (databricks/databricks-sql-python#467 by @jprakash-db)
+
 # 3.6.0 (2024-10-25)
 
 - Support encryption headers in the cloud fetch request (https://github.com/databricks/databricks-sql-python/pull/460 by @jackyhu-db)
diff --git a/docs/parameters.md b/docs/parameters.md
index a538af1a6..f9f4c5ff9 100644
--- a/docs/parameters.md
+++ b/docs/parameters.md
@@ -17,6 +17,7 @@ See `examples/parameters.py` in this repository for a runnable demo.
 
 - A query executed with native parameters can contain at most 255 parameter markers
 - The maximum size of all parameterized values cannot exceed 1MB
+- For volume operations such as PUT, native parameters are not supported
 
 ## SQL Syntax
 
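Because PUT cannot carry native parameter markers, any local and volume paths have to be inlined into the SQL text. A minimal sketch of a volume upload without native parameters — the hostname, credentials, and `/Volumes/...` path below are placeholders, not real values:

```python
from databricks import sql

# All connection details below are hypothetical placeholders.
with sql.connect(
    server_hostname="...",
    http_path="...",
    access_token="...",
    staging_allowed_local_path="/tmp",  # required for PUT/GET staging operations
) as conn:
    with conn.cursor() as cursor:
        # Native parameter markers (e.g. :path) are NOT supported here,
        # so both paths are written directly into the statement.
        cursor.execute(
            "PUT '/tmp/data.csv' INTO '/Volumes/main/default/my_volume/data.csv' OVERWRITE"
        )
```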
diff --git a/poetry.lock b/poetry.lock
index 9fe49690f..576adbd35 100755
--- a/poetry.lock
+++ b/poetry.lock
@@ -1202,4 +1202,4 @@ sqlalchemy = ["sqlalchemy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.8.0"
-content-hash = "31066a85f646d0009d6fe9ffc833a64fcb4b6923c2e7f2652e7aa8540acba298"
+content-hash = "9d8a91369fc79f9ca9f7502e2ed284b66531c954ae59a723e465a76073966998"
diff --git a/pyproject.toml b/pyproject.toml
index 1747d21b1..6a38e33b1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "databricks-sql-connector"
-version = "3.6.0"
+version = "3.7.2"
 description = "Databricks SQL Connector for Python"
 authors = ["Databricks "]
 license = "Apache-2.0"
@@ -14,14 +14,14 @@ thrift = ">=0.16.0,<0.21.0"
 pandas = [
     { version = ">=1.2.5,<2.3.0", python = ">=3.8" }
 ]
-pyarrow = ">=14.0.1,<17"
+pyarrow = ">=14.0.1"
 lz4 = "^4.0.2"
 requests = "^2.18.1"
 oauthlib = "^3.1.0"
 numpy = [
-    { version = "^1.16.6", python = ">=3.8,<3.11" },
-    { version = "^1.23.4", python = ">=3.11" },
+    { version = ">=1.16.6", python = ">=3.8,<3.11" },
+    { version = ">=1.23.4", python = ">=3.11" },
 ]
 sqlalchemy = { version = ">=2.0.21", optional = true }
 openpyxl = "^3.0.10"
diff --git a/src/databricks/sql/__init__.py b/src/databricks/sql/__init__.py
index 42167b008..931e5c0af 100644
--- a/src/databricks/sql/__init__.py
+++ b/src/databricks/sql/__init__.py
@@ -68,7 +68,7 @@ def __repr__(self):
 DATE = DBAPITypeObject("date")
 ROWID = DBAPITypeObject()
 
-__version__ = "3.6.0"
+__version__ = "3.7.2"
 USER_AGENT_NAME = "PyDatabricksSqlConnector"
 
 # These two functions are pyhive legacy
diff --git a/src/databricks/sql/auth/retry.py b/src/databricks/sql/auth/retry.py
index 0c6547cb4..eedcc773f 100755
--- a/src/databricks/sql/auth/retry.py
+++ b/src/databricks/sql/auth/retry.py
@@ -1,4 +1,5 @@
 import logging
+import random
 import time
 import typing
 from enum import Enum
@@ -285,25 +286,31 @@ def sleep_for_retry(self, response: BaseHTTPResponse) -> bool:
         """
         retry_after = self.get_retry_after(response)
         if retry_after:
-            backoff = self.get_backoff_time()
-            proposed_wait = max(backoff, retry_after)
-            self.check_proposed_wait(proposed_wait)
-            time.sleep(proposed_wait)
-            return True
+            proposed_wait = retry_after
+        else:
+            proposed_wait = self.get_backoff_time()
 
-        return False
+        proposed_wait = max(proposed_wait, self.delay_max)
+        self.check_proposed_wait(proposed_wait)
+        logger.debug(f"Retrying after {proposed_wait} seconds")
+        time.sleep(proposed_wait)
+        return True
 
     def get_backoff_time(self) -> float:
-        """Calls urllib3's built-in get_backoff_time.
+        """
+        Calculate the delay between retries using exponential backoff.
 
         Never returns a value larger than self.delay_max
         A MaxRetryDurationError will be raised if the calculated backoff would exceed self.max_attempts_duration
 
-        Note: within urllib3, a backoff is only calculated in cases where a Retry-After header is not present
-        in the previous unsuccessful request and `self.respect_retry_after_header` is True (which is always true)
+        :return: the proposed backoff delay, in seconds
         """
-        proposed_backoff = super().get_backoff_time()
+        current_attempt = self.stop_after_attempts_count - int(self.total or 0)
+        proposed_backoff = (2**current_attempt) * self.delay_min
+        if self.backoff_jitter != 0.0:
+            proposed_backoff += random.random() * self.backoff_jitter
+        proposed_backoff = min(proposed_backoff, self.delay_max)
 
         self.check_proposed_wait(proposed_backoff)
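The new `get_backoff_time` replaces urllib3's built-in calculation with an exponential ramp from `delay_min`, capped at `delay_max`, plus optional jitter. A self-contained sketch of the same arithmetic, using illustrative values rather than the connector's defaults:

```python
import random


def backoff_delay(attempt: int, delay_min: float, delay_max: float, jitter: float = 0.0) -> float:
    """Exponential backoff capped at delay_max, mirroring DatabricksRetryPolicy.get_backoff_time."""
    delay = (2**attempt) * delay_min
    if jitter != 0.0:
        delay += random.random() * jitter  # spread simultaneous retries apart
    return min(delay, delay_max)


# Illustrative values: delay_min=1s, delay_max=30s -> 2, 4, 8, 16, 30, 30, ...
print([backoff_delay(n, delay_min=1, delay_max=30) for n in range(1, 7)])
```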
diff --git a/src/databricks/sql/client.py b/src/databricks/sql/client.py
index 4e0ab941b..aefed1ef0 100755
--- a/src/databricks/sql/client.py
+++ b/src/databricks/sql/client.py
@@ -1,3 +1,4 @@
+import time
 from typing import Dict, Tuple, List, Optional, Any, Union, Sequence
 
 import pandas
@@ -47,6 +48,7 @@
 from databricks.sql.thrift_api.TCLIService.ttypes import (
     TSparkParameter,
+    TOperationState,
 )
@@ -430,6 +432,8 @@ def __init__(
         self.escaper = ParamEscaper()
         self.lastrowid = None
 
+        self.ASYNC_DEFAULT_POLLING_INTERVAL = 2
+
     # The ideal return type for this method is perhaps Self, but that was not added until 3.11, and we support pre-3.11 pythons, currently.
     def __enter__(self) -> "Cursor":
         return self
@@ -796,6 +800,7 @@ def execute(
             cursor=self,
             use_cloud_fetch=self.connection.use_cloud_fetch,
             parameters=prepared_params,
+            async_op=False,
         )
         self.active_result_set = ResultSet(
             self.connection,
@@ -803,6 +808,7 @@
             self.thrift_backend,
             self.buffer_size_bytes,
             self.arraysize,
+            self.connection.use_cloud_fetch,
         )
 
         if execute_response.is_staging_operation:
@@ -812,6 +818,106 @@
         return self
 
+    def execute_async(
+        self,
+        operation: str,
+        parameters: Optional[TParameterCollection] = None,
+    ) -> "Cursor":
+        """
+        Execute a query asynchronously and return immediately, without waiting for it to complete.
+
+        :param operation: The SQL statement to execute, as in execute()
+        :param parameters: Optional query parameters, as in execute()
+        :return: self
+        """
+        param_approach = self._determine_parameter_approach(parameters)
+        if param_approach == ParameterApproach.NONE:
+            prepared_params = NO_NATIVE_PARAMS
+            prepared_operation = operation
+
+        elif param_approach == ParameterApproach.INLINE:
+            prepared_operation, prepared_params = self._prepare_inline_parameters(
+                operation, parameters
+            )
+        elif param_approach == ParameterApproach.NATIVE:
+            normalized_parameters = self._normalize_tparametercollection(parameters)
+            param_structure = self._determine_parameter_structure(normalized_parameters)
+            transformed_operation = transform_paramstyle(
+                operation, normalized_parameters, param_structure
+            )
+            prepared_operation, prepared_params = self._prepare_native_parameters(
+                transformed_operation, normalized_parameters, param_structure
+            )
+
+        self._check_not_closed()
+        self._close_and_clear_active_result_set()
+        self.thrift_backend.execute_command(
+            operation=prepared_operation,
+            session_handle=self.connection._session_handle,
+            max_rows=self.arraysize,
+            max_bytes=self.buffer_size_bytes,
+            lz4_compression=self.connection.lz4_compression,
+            cursor=self,
+            use_cloud_fetch=self.connection.use_cloud_fetch,
+            parameters=prepared_params,
+            async_op=True,
+        )
+
+        return self
+
+    def get_query_state(self) -> "TOperationState":
+        """
+        Poll the server for the current state of the asynchronously executing query.
+
+        :return: the current TOperationState of the query
+        """
+        self._check_not_closed()
+        return self.thrift_backend.get_query_state(self.active_op_handle)
+
+    def get_async_execution_result(self):
+        """
+        Poll the asynchronously executing query until it leaves the pending/running states,
+        then fetch its result if it finished successfully.
+
+        Raises an Error if the query ends in any state other than FINISHED_STATE.
+        :return: self
+        """
+        self._check_not_closed()
+
+        def is_executing(operation_state) -> bool:
+            return not operation_state or operation_state in [
+                ttypes.TOperationState.RUNNING_STATE,
+                ttypes.TOperationState.PENDING_STATE,
+            ]
+
+        while is_executing(self.get_query_state()):
+            # Poll at the default interval until the query leaves a pending/running state
+            time.sleep(self.ASYNC_DEFAULT_POLLING_INTERVAL)
+
+        operation_state = self.get_query_state()
+        if operation_state == ttypes.TOperationState.FINISHED_STATE:
+            execute_response = self.thrift_backend.get_execution_result(
+                self.active_op_handle, self
+            )
+            self.active_result_set = ResultSet(
+                self.connection,
+                execute_response,
+                self.thrift_backend,
+                self.buffer_size_bytes,
+                self.arraysize,
+            )
+
+            if execute_response.is_staging_operation:
+                self._handle_staging_operation(
+                    staging_allowed_local_path=self.thrift_backend.staging_allowed_local_path
+                )
+
+            return self
+        else:
+            raise Error(
+                f"get_execution_result failed with operation state {operation_state}"
+            )
+
     def executemany(self, operation, seq_of_parameters):
         """
         Execute the operation once for every set of passed in parameters.
@@ -1097,6 +1203,7 @@ def __init__(
         thrift_backend: ThriftBackend,
         result_buffer_size_bytes: int = DEFAULT_RESULT_BUFFER_SIZE_BYTES,
         arraysize: int = 10000,
+        use_cloud_fetch: bool = True,
     ):
         """
         A ResultSet manages the results of a single command.
@@ -1118,6 +1225,7 @@
         self.description = execute_response.description
         self._arrow_schema_bytes = execute_response.arrow_schema_bytes
         self._next_row_index = 0
+        self._use_cloud_fetch = use_cloud_fetch
 
         if execute_response.arrow_queue:
             # In this case the server has taken the fast path and returned an initial batch of
@@ -1145,6 +1253,7 @@ def _fill_results_buffer(self):
             lz4_compressed=self.lz4_compressed,
             arrow_schema_bytes=self._arrow_schema_bytes,
             description=self.description,
+            use_cloud_fetch=self._use_cloud_fetch,
         )
         self.results = results
         self.has_more_rows = has_more_rows
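Taken together, the new cursor surface works the way the e2e test further down exercises it: submit with `execute_async`, poll `get_query_state` until the operation leaves the pending/running states, then collect rows with `get_async_execution_result` (which also polls internally). A minimal sketch; hostname, path, and token are placeholders:

```python
import time

from databricks import sql
from databricks.sql.thrift_api.TCLIService import ttypes


def still_executing(state) -> bool:
    # Mirrors the connector's own polling helper: no state yet, pending, or running.
    return not state or state in (
        ttypes.TOperationState.PENDING_STATE,
        ttypes.TOperationState.RUNNING_STATE,
    )


# Hypothetical connection details.
with sql.connect(server_hostname="...", http_path="...", access_token="...") as conn:
    with conn.cursor() as cursor:
        # Returns immediately; the query keeps running server-side.
        cursor.execute_async("SELECT COUNT(*) FROM range(10000)")

        while still_executing(cursor.get_query_state()):
            time.sleep(2)  # same cadence as ASYNC_DEFAULT_POLLING_INTERVAL

        # Raises if the query ended in any state other than FINISHED_STATE.
        cursor.get_async_execution_result()
        print(cursor.fetchall())
```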
diff --git a/src/databricks/sql/thrift_backend.py b/src/databricks/sql/thrift_backend.py
index cf5cd906b..9beab0371 100644
--- a/src/databricks/sql/thrift_backend.py
+++ b/src/databricks/sql/thrift_backend.py
@@ -7,6 +7,8 @@
 import threading
 from typing import List, Union
 
+from databricks.sql.thrift_api.TCLIService.ttypes import TOperationState
+
 try:
     import pyarrow
 except ImportError:
@@ -319,7 +321,7 @@ def _handle_request_error(self, error_info, attempt, elapsed):
 
     # FUTURE: Consider moving to https://github.com/litl/backoff or
     # https://github.com/jd/tenacity for retry logic.
-    def make_request(self, method, request):
+    def make_request(self, method, request, retryable=True):
         """Execute given request, attempting retries when
             1. Receiving HTTP 429/503 from server
             2. OSError is raised during a GetOperationStatus
@@ -458,7 +460,7 @@ def attempt_request(attempt):
         # return on success
         # if available: bounded delay and retry
         # if not: raise error
-        max_attempts = self._retry_stop_after_attempts_count
+        max_attempts = self._retry_stop_after_attempts_count if retryable else 1
 
         # use index-1 counting for logging/human consistency
         for attempt in range(1, max_attempts + 1):
@@ -769,6 +771,63 @@ def _results_message_to_execute_response(self, resp, operation_state):
             arrow_schema_bytes=schema_bytes,
         )
 
+    def get_execution_result(self, op_handle, cursor):
+        assert op_handle is not None
+
+        req = ttypes.TFetchResultsReq(
+            operationHandle=ttypes.TOperationHandle(
+                op_handle.operationId,
+                op_handle.operationType,
+                False,
+                op_handle.modifiedRowCount,
+            ),
+            maxRows=cursor.arraysize,
+            maxBytes=cursor.buffer_size_bytes,
+            orientation=ttypes.TFetchOrientation.FETCH_NEXT,
+            includeResultSetMetadata=True,
+        )
+
+        resp = self.make_request(self._client.FetchResults, req)
+
+        t_result_set_metadata_resp = resp.resultSetMetadata
+
+        lz4_compressed = t_result_set_metadata_resp.lz4Compressed
+        is_staging_operation = t_result_set_metadata_resp.isStagingOperation
+        has_more_rows = resp.hasMoreRows
+        description = self._hive_schema_to_description(
+            t_result_set_metadata_resp.schema
+        )
+
+        schema_bytes = (
+            t_result_set_metadata_resp.arrowSchema
+            or self._hive_schema_to_arrow_schema(t_result_set_metadata_resp.schema)
+            .serialize()
+            .to_pybytes()
+        )
+
+        queue = ResultSetQueueFactory.build_queue(
+            row_set_type=resp.resultSetMetadata.resultFormat,
+            t_row_set=resp.results,
+            arrow_schema_bytes=schema_bytes,
+            max_download_threads=self.max_download_threads,
+            lz4_compressed=lz4_compressed,
+            description=description,
+            ssl_options=self._ssl_options,
+        )
+
+        return ExecuteResponse(
+            arrow_queue=queue,
+            status=resp.status,
+            has_been_closed_server_side=False,
+            has_more_rows=has_more_rows,
+            lz4_compressed=lz4_compressed,
+            is_staging_operation=is_staging_operation,
+            command_handle=op_handle,
+            description=description,
+            arrow_schema_bytes=schema_bytes,
+        )
+
     def _wait_until_command_done(self, op_handle, initial_operation_status_resp):
         if initial_operation_status_resp:
             self._check_command_not_in_error_or_closed_state(
@@ -787,6 +846,12 @@ def _wait_until_command_done(self, op_handle, initial_operation_status_resp):
             self._check_command_not_in_error_or_closed_state(op_handle, poll_resp)
         return operation_state
 
+    def get_query_state(self, op_handle) -> "TOperationState":
+        poll_resp = self._poll_for_status(op_handle)
+        operation_state = poll_resp.operationState
+        self._check_command_not_in_error_or_closed_state(op_handle, poll_resp)
+        return operation_state
+
     @staticmethod
     def _check_direct_results_for_error(t_spark_direct_results):
         if t_spark_direct_results:
@@ -817,6 +882,7 @@ def execute_command(
         cursor,
         use_cloud_fetch=True,
         parameters=[],
+        async_op=False,
     ):
         assert session_handle is not None
 
@@ -846,7 +912,11 @@ def execute_command(
             parameters=parameters,
         )
         resp = self.make_request(self._client.ExecuteStatement, req)
-        return self._handle_execute_response(resp, cursor)
+
+        if async_op:
+            self._handle_execute_response_async(resp, cursor)
+        else:
+            return self._handle_execute_response(resp, cursor)
 
     def get_catalogs(self, session_handle, max_rows, max_bytes, cursor):
         assert session_handle is not None
@@ -945,6 +1015,10 @@ def _handle_execute_response(self, resp, cursor):
 
         return self._results_message_to_execute_response(resp, final_operation_state)
 
+    def _handle_execute_response_async(self, resp, cursor):
+        cursor.active_op_handle = resp.operationHandle
+        self._check_direct_results_for_error(resp.directResults)
+
     def fetch_results(
         self,
         op_handle,
@@ -954,6 +1028,7 @@
         lz4_compressed,
         arrow_schema_bytes,
         description,
+        use_cloud_fetch=True,
     ):
         assert op_handle is not None
 
@@ -970,10 +1045,11 @@ def fetch_results(
             includeResultSetMetadata=True,
         )
 
-        resp = self.make_request(self._client.FetchResults, req)
+        # Inline fetches with FETCH_NEXT orientation are not idempotent and hence are not retried
+        resp = self.make_request(self._client.FetchResults, req, use_cloud_fetch)
 
         if resp.results.startRowOffset > expected_row_start_offset:
-            logger.warning(
-                "Expected results to start from {} but they instead start at {}".format(
+            raise DataError(
+                "fetch_results failed due to inconsistency in the state between the client and the server. Expected results to start at row {} but they instead start at row {}; some result batches must have been skipped".format(
                     expected_row_start_offset, resp.results.startRowOffset
                 )
             )
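With inline FETCH_NEXT fetches no longer retried, an offset mismatch now surfaces as a `DataError` rather than a warning, so a caller that hits one has to re-run the statement instead of silently continuing past skipped batches. A hedged sketch of one way to handle that — the helper below is hypothetical, not part of the connector, and assumes `databricks.sql.exc.DataError` (the connector's DB-API error class):

```python
from databricks.sql.exc import DataError


def fetch_all_or_retry(cursor, query: str, max_attempts: int = 2):
    """Re-run the whole query if the client/server row offsets fall out of sync."""
    for attempt in range(max_attempts):
        cursor.execute(query)
        try:
            return cursor.fetchall()
        except DataError:
            # Result batches may have been skipped; the safe recovery is a full re-run.
            if attempt == max_attempts - 1:
                raise
```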
diff --git a/tests/e2e/common/retry_test_mixins.py b/tests/e2e/common/retry_test_mixins.py
index 7dd5f7450..b5d01a45d 100755
--- a/tests/e2e/common/retry_test_mixins.py
+++ b/tests/e2e/common/retry_test_mixins.py
@@ -121,9 +121,9 @@ class PySQLRetryTestsMixin:
     # For testing purposes
     _retry_policy = {
         "_retry_delay_min": 0.1,
-        "_retry_delay_max": 5,
+        "_retry_delay_max": 3,
         "_retry_stop_after_attempts_count": 5,
-        "_retry_stop_after_attempts_duration": 10,
+        "_retry_stop_after_attempts_duration": 30,
         "_retry_delay_default": 0.5,
     }
 
@@ -135,7 +135,7 @@ def test_retry_urllib3_settings_are_honored(self):
         urllib3_config = {"connect": 10, "read": 11, "redirect": 12}
         rp = DatabricksRetryPolicy(
             delay_min=0.1,
-            delay_max=10.0,
+            delay_max=3,
             stop_after_attempts_count=10,
             stop_after_attempts_duration=10.0,
             delay_default=1.0,
@@ -174,14 +174,14 @@ def test_retry_max_count_not_exceeded(self):
     def test_retry_exponential_backoff(self):
         """GIVEN the retry policy is configured for reasonable exponential backoff
         WHEN the server sends nothing but 429 responses with retry-afters
-        THEN the connector will use those retry-afters as a floor
+        THEN the connector will use those Retry-After values as a floor
         """
         retry_policy = self._retry_policy.copy()
         retry_policy["_retry_delay_min"] = 1
 
         time_start = time.time()
         with mocked_server_response(
-            status=429, headers={"Retry-After": "3"}
+            status=429, headers={"Retry-After": "8"}
         ) as mock_obj:
             with pytest.raises(RequestError) as cm:
                 with self.connection(extra_params=retry_policy) as conn:
@@ -191,14 +191,14 @@
         assert isinstance(cm.value.args[1], MaxRetryDurationError)
 
         # With setting delay_min to 1, the expected retry delays should be:
-        # 3, 3, 4
-        # The first 2 retries are allowed, the 3rd retry puts the total duration over the limit
-        # of 10 seconds
-        assert mock_obj.return_value.getresponse.call_count == 3
-        assert duration > 6
-
-        # Should be less than 7, but this is a safe margin for CI/CD slowness
-        assert duration < 10
+        # 8, 8, 8, 8
+        # The first 3 retries are allowed; the 4th retry puts the total duration over the limit
+        # of 30 seconds
+        assert mock_obj.return_value.getresponse.call_count == 4
+        assert duration > 24
+
+        # Should be less than 26, but this is a safe margin for CI/CD slowness
+        assert duration < 30
 
     def test_retry_max_duration_not_exceeded(self):
         """GIVEN the max attempt duration of 10 seconds
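The `_retry_*` keys in the mixin above are the same keyword arguments an application can pass to `sql.connect` (the tests forward them via `extra_params`), so the tuned policy can be reproduced on a real connection. A sketch with illustrative values, not the connector's defaults:

```python
from databricks import sql

# Hypothetical workspace details; the _retry_* values are illustrative only.
connection = sql.connect(
    server_hostname="...",
    http_path="...",
    access_token="...",
    _retry_delay_min=0.1,
    _retry_delay_max=3,
    _retry_stop_after_attempts_count=5,
    _retry_stop_after_attempts_duration=30,
    _retry_delay_default=0.5,
)
```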
diff --git a/tests/e2e/test_driver.py b/tests/e2e/test_driver.py
index cfd1e9699..2f0881cda 100644
--- a/tests/e2e/test_driver.py
+++ b/tests/e2e/test_driver.py
@@ -36,6 +36,7 @@
     compare_dbr_versions,
     is_thrift_v5_plus,
 )
+from databricks.sql.thrift_api.TCLIService import ttypes
 from tests.e2e.common.core_tests import CoreTestMixin, SmokeTestMixin
 from tests.e2e.common.large_queries_mixin import LargeQueriesMixin
 from tests.e2e.common.timestamp_tests import TimestampTestsMixin
@@ -78,6 +79,7 @@ class PySQLPytestTestCase:
     }
     arraysize = 1000
     buffer_size_bytes = 104857600
+    POLLING_INTERVAL = 2
 
     @pytest.fixture(autouse=True)
     def get_details(self, connection_details):
@@ -175,6 +177,27 @@ def test_cloud_fetch(self):
         for i in range(len(cf_result)):
             assert cf_result[i] == noop_result[i]
 
+    def test_execute_async(self):
+        def is_executing(operation_state):
+            return not operation_state or operation_state in [
+                ttypes.TOperationState.RUNNING_STATE,
+                ttypes.TOperationState.PENDING_STATE,
+            ]
+
+        long_running_query = "SELECT COUNT(*) FROM RANGE(10000 * 16) x JOIN RANGE(10000) y ON FROM_UNIXTIME(x.id * y.id, 'yyyy-MM-dd') LIKE '%not%a%date%'"
+        with self.cursor() as cursor:
+            cursor.execute_async(long_running_query)
+
+            # Poll every POLLING_INTERVAL seconds
+            while is_executing(cursor.get_query_state()):
+                time.sleep(self.POLLING_INTERVAL)
+                log.info("Polling the status in test_execute_async")
+
+            cursor.get_async_execution_result()
+            result = cursor.fetchall()
+
+        assert result[0].asDict() == {"count(1)": 0}
+
 
 # Exclude Retry tests because they require specific setups, and LargeQueries too slow for core
 # tests
diff --git a/tests/unit/test_fetches.py b/tests/unit/test_fetches.py
index e9a58acdd..89cedcfae 100644
--- a/tests/unit/test_fetches.py
+++ b/tests/unit/test_fetches.py
@@ -66,6 +66,7 @@ def fetch_results(
             lz4_compressed,
             arrow_schema_bytes,
             description,
+            use_cloud_fetch=True,
         ):
             nonlocal batch_index
             results = FetchTests.make_arrow_queue(batch_list[batch_index])
diff --git a/tests/unit/test_retry.py b/tests/unit/test_retry.py
index 2108af4fa..897a1d111 100644
--- a/tests/unit/test_retry.py
+++ b/tests/unit/test_retry.py
@@ -1,10 +1,9 @@
-from os import error
 import time
-from unittest.mock import Mock, patch
+from unittest.mock import patch, call
 import pytest
-from requests import Request
 from urllib3 import HTTPResponse
-from databricks.sql.auth.retry import DatabricksRetryPolicy, RequestHistory
+from databricks.sql.auth.retry import DatabricksRetryPolicy, RequestHistory, CommandType
+from urllib3.exceptions import MaxRetryError
 
 
 class TestRetry:
@@ -25,32 +24,62 @@ def error_history(self) -> RequestHistory:
             method="POST", url=None, error=None, status=503, redirect_location=None
         )
 
+    def calculate_backoff_time(self, attempt, delay_min, delay_max):
+        exponential_backoff_time = (2**attempt) * delay_min
+        return min(exponential_backoff_time, delay_max)
+
     @patch("time.sleep")
     def test_sleep__no_retry_after(self, t_mock, retry_policy, error_history):
         retry_policy._retry_start_time = time.time()
         retry_policy.history = [error_history, error_history]
         retry_policy.sleep(HTTPResponse(status=503))
-        t_mock.assert_called_with(2)
 
-    @patch("time.sleep")
-    def test_sleep__retry_after_is_binding(self, t_mock, retry_policy, error_history):
-        retry_policy._retry_start_time = time.time()
-        retry_policy.history = [error_history, error_history]
-        retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "3"}))
-        t_mock.assert_called_with(3)
+        expected_backoff_time = max(
+            self.calculate_backoff_time(
+                0, retry_policy.delay_min, retry_policy.delay_max
+            ),
+            retry_policy.delay_max,
+        )
+        t_mock.assert_called_with(expected_backoff_time)
 
     @patch("time.sleep")
-    def test_sleep__retry_after_present_but_not_binding(
-        self, t_mock, retry_policy, error_history
-    ):
+    def test_sleep__no_retry_after_header__multiple_retries(self, t_mock, retry_policy):
+        num_attempts = retry_policy.stop_after_attempts_count
+
         retry_policy._retry_start_time = time.time()
-        retry_policy.history = [error_history, error_history]
-        retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "1"}))
-        t_mock.assert_called_with(2)
+        retry_policy.command_type = CommandType.OTHER
+
+        for attempt in range(num_attempts):
+            retry_policy.sleep(HTTPResponse(status=503))
+            # Internally urllib3 calls the increment function, generating a new policy instance for every retry
+            retry_policy = retry_policy.increment()
+
+        expected_backoff_times = []
+        for attempt in range(num_attempts):
+            expected_backoff_times.append(
+                max(
+                    self.calculate_backoff_time(
+                        attempt, retry_policy.delay_min, retry_policy.delay_max
+                    ),
+                    retry_policy.delay_max,
+                )
+            )
+
+        # Assert that the sleep values were passed in the expected order
+        t_mock.assert_has_calls(
+            [call(expected_time) for expected_time in expected_backoff_times]
+        )
 
     @patch("time.sleep")
-    def test_sleep__retry_after_surpassed(self, t_mock, retry_policy, error_history):
+    def test_excessive_retry_attempts_error(self, t_mock, retry_policy):
+        # Attempting more than stop_after_attempts_count retries should raise MaxRetryError
+        num_attempts = retry_policy.stop_after_attempts_count + 1
+
         retry_policy._retry_start_time = time.time()
-        retry_policy.history = [error_history, error_history, error_history]
-        retry_policy.sleep(HTTPResponse(status=503, headers={"Retry-After": "3"}))
-        t_mock.assert_called_with(4)
+        retry_policy.command_type = CommandType.OTHER
+
+        with pytest.raises(MaxRetryError):
+            for attempt in range(num_attempts):
+                retry_policy.sleep(HTTPResponse(status=503))
+                # Internally urllib3 calls the increment function, generating a new policy instance for every retry
+                retry_policy = retry_policy.increment()