From 64b267ee594f8da664b033cdb51840bd7d0b0667 Mon Sep 17 00:00:00 2001 From: jana-selva Date: Wed, 29 Apr 2026 15:41:54 +0530 Subject: [PATCH 1/3] Fix SaaS DB lifecycle retries and stale ID recovery --- exasol/saas/client/api_access.py | 77 +++++++++++++++++++++++++++++--- test/unit/test_api_access.py | 60 +++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 7 deletions(-) diff --git a/exasol/saas/client/api_access.py b/exasol/saas/client/api_access.py index 2c0d8a4..f2bafcd 100644 --- a/exasol/saas/client/api_access.py +++ b/exasol/saas/client/api_access.py @@ -254,6 +254,31 @@ class OpenApiAccess: def __init__(self, client: openapi.AuthenticatedClient, account_id: str): self._client = client self._account_id = account_id + self._database_name_by_id: dict[str, str] = {} + + def _try_refresh_database_id(self, database_id: str) -> str | None: + """ + Try to resolve a potentially stale database id using the remembered + database name from create_database(). + """ + database_name = self._database_name_by_id.get(database_id) + if not database_name: + return database_id + try: + refreshed_database_id = _get_database_id( + self._account_id, self._client, database_name + ) + except RuntimeError: + return None + if refreshed_database_id != database_id: + LOG.warning( + "Recovered stale database ID for '%s': %s -> %s", + database_name, + database_id, + refreshed_database_id, + ) + self._database_name_by_id[refreshed_database_id] = database_name + return refreshed_database_id def create_database( self, @@ -289,6 +314,7 @@ def minutes(x: timedelta) -> int: database = ensure_type( ExasolDatabase, resp, f"Failed to create database {name}" ) + self._database_name_by_id[database.id] = name LOG.info("Created database with ID %s", database.id) return database @@ -325,10 +351,13 @@ def delete_database( max_interval: timedelta = timedelta(minutes=2), ) -> None: def is_retry(resp: ApiError) -> bool: + msg = resp.message.lower() return ( resp.status == 400 - and "cluster is 
not in a proper state" in resp.message - ) + and "cluster is not in a proper state" in msg + ) or resp.status in (429, 500, 502, 503, 504) + + current_database_id = database_id @retry( wait=wait_exponential( @@ -340,20 +369,39 @@ def is_retry(resp: ApiError) -> bool: retry=retry_if_exception_type(TryAgain), ) def delete_with_retry() -> None: + nonlocal current_database_id LOG.info("- Trying to delete ...") + refreshed_database_id = self._try_refresh_database_id(current_database_id) + if refreshed_database_id is None: + LOG.info("Database %s is not listed anymore.", current_database_id) + return + current_database_id = refreshed_database_id resp = delete_database.sync( self._account_id, - database_id, + current_database_id, client=self._client, ) if not isinstance(resp, ApiError): # success return + if ( + resp.status == 404 + and "user/database not found" in resp.message.lower() + and self._database_name_by_id.get(current_database_id) + ): + refreshed_database_id = self._try_refresh_database_id( + current_database_id + ) + if refreshed_database_id is None: + LOG.info("Database %s is already deleted.", current_database_id) + return + current_database_id = refreshed_database_id + raise TryAgain if is_retry(resp): raise TryAgain raise InternalError(f"HTTP {resp.status}: {resp.message}.") - LOG.info("Got request to delete database with ID %s", database_id) + LOG.info("Got request to delete database with ID %s", current_database_id) try: delete_with_retry() LOG.info("Successfully deleted database.") @@ -411,21 +459,36 @@ def wait_until_running( database_id: str, timeout: timedelta = timedelta(minutes=30), interval: timedelta = timedelta(minutes=2), - ): + ) -> str: success = [Status.RUNNING] + current_database_id = database_id @interval_retry(interval, timeout) def poll_status() -> Status: - db = self.get_database(database_id) + nonlocal current_database_id + try: + db = self.get_database(current_database_id) + except OpenApiError as ex: + if "user/database not found" 
not in str(ex).lower(): + raise + refreshed_database_id = self._try_refresh_database_id( + current_database_id + ) + if refreshed_database_id is None: + LOG.info("- Database %s not listed yet ...", current_database_id) + raise TryAgain + current_database_id = refreshed_database_id + raise TryAgain status = db.status if db else None if status not in success: LOG.info("- Database status: %s ...", status) raise TryAgain return status - LOG.info("Waiting for database with ID %s to be available:", database_id) + LOG.info("Waiting for database with ID %s to be available:", current_database_id) if poll_status() not in success: raise DatabaseStartupFailure() + return current_database_id def clusters( self, diff --git a/test/unit/test_api_access.py b/test/unit/test_api_access.py index 0aa4164..22d8223 100644 --- a/test/unit/test_api_access.py +++ b/test/unit/test_api_access.py @@ -6,10 +6,12 @@ from exasol.saas.client.api_access import ( DatabaseDeleteError, + OpenApiError, OpenApiAccess, timestamp_name, ) from exasol.saas.client.openapi.models.api_error import ApiError +from exasol.saas.client.openapi.models.status import Status def response(status_code: int, message: str, spec=None): @@ -84,6 +86,12 @@ def test_delete_fail(api_mock, monkeypatch, side_effect, retry_timings) -> None: "", id="success_after_retry", ), + pytest.param( + [api_error(500, "Internal server error"), response(200, "")], + False, + "", + id="success_after_http_500_retry", + ), pytest.param( [api_error(400, "bla")], True, @@ -112,6 +120,58 @@ def test_delete_success( assert expected_log_message in caplog.text +def test_wait_until_running_recovers_stale_database_id(api_mock, monkeypatch) -> None: + from exasol.saas.client import api_access + + api_mock._database_name_by_id["old-id"] = "db-name" + + get_database_calls = [] + + def get_database_side_effect(database_id): + get_database_calls.append(database_id) + if database_id == "old-id": + raise OpenApiError( + "Failed to get database old-id", + 
api_error(404, "User/Database not found"), + ) + return Mock(status=Status.RUNNING) + + monkeypatch.setattr(api_mock, "get_database", get_database_side_effect) + monkeypatch.setattr(api_access, "_get_database_id", Mock(return_value="new-id")) + + with not_raises(Exception): + running_database_id = api_mock.wait_until_running( + "old-id", + timeout=timedelta(seconds=0.3), + interval=timedelta(seconds=0.1), + ) + + assert running_database_id == "new-id" + assert get_database_calls == ["old-id", "new-id"] + + +def test_delete_recovers_stale_database_id(api_mock, monkeypatch, retry_timings) -> None: + from exasol.saas.client import api_access + + api_mock._database_name_by_id["old-id"] = "db-name" + + delete_calls = [] + + def delete_side_effect(account_id, database_id, client): + delete_calls.append(database_id) + if database_id == "old-id": + return api_error(404, "User/Database not found") + return response(200, "") + + monkeypatch.setattr(api_access.delete_database, "sync", delete_side_effect) + monkeypatch.setattr(api_access, "_get_database_id", Mock(return_value="new-id")) + + with not_raises(Exception): + api_mock.delete_database("old-id", **retry_timings) + + assert delete_calls == ["new-id"] + + def test_timestamp_name() -> None: names = [timestamp_name("TEST") for _ in range(3)] minutes = [int(name[:5], 16) for name in names] From efd8eaf4aefeda48634ccf99efca1dada8266b74 Mon Sep 17 00:00:00 2001 From: jana-selva Date: Wed, 29 Apr 2026 15:52:20 +0530 Subject: [PATCH 2/3] Apply nox format fixes for SaaS lifecycle patch --- exasol/saas/client/api_access.py | 7 ++++--- test/unit/test_api_access.py | 6 ++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/exasol/saas/client/api_access.py b/exasol/saas/client/api_access.py index f2bafcd..760f6c9 100644 --- a/exasol/saas/client/api_access.py +++ b/exasol/saas/client/api_access.py @@ -353,8 +353,7 @@ def delete_database( def is_retry(resp: ApiError) -> bool: msg = resp.message.lower() return ( - 
resp.status == 400 - and "cluster is not in a proper state" in msg + resp.status == 400 and "cluster is not in a proper state" in msg ) or resp.status in (429, 500, 502, 503, 504) current_database_id = database_id @@ -485,7 +484,9 @@ def poll_status() -> Status: raise TryAgain return status - LOG.info("Waiting for database with ID %s to be available:", current_database_id) + LOG.info( + "Waiting for database with ID %s to be available:", current_database_id + ) if poll_status() not in success: raise DatabaseStartupFailure() return current_database_id diff --git a/test/unit/test_api_access.py b/test/unit/test_api_access.py index 22d8223..4c9d667 100644 --- a/test/unit/test_api_access.py +++ b/test/unit/test_api_access.py @@ -6,8 +6,8 @@ from exasol.saas.client.api_access import ( DatabaseDeleteError, - OpenApiError, OpenApiAccess, + OpenApiError, timestamp_name, ) from exasol.saas.client.openapi.models.api_error import ApiError @@ -150,7 +150,9 @@ def get_database_side_effect(database_id): assert get_database_calls == ["old-id", "new-id"] -def test_delete_recovers_stale_database_id(api_mock, monkeypatch, retry_timings) -> None: +def test_delete_recovers_stale_database_id( + api_mock, monkeypatch, retry_timings +) -> None: from exasol.saas.client import api_access api_mock._database_name_by_id["old-id"] = "db-name" From 764e8dd47b18f4efb9a7fce16a1228ecdbdcabd8 Mon Sep 17 00:00:00 2001 From: jana-selva Date: Wed, 29 Apr 2026 16:34:01 +0530 Subject: [PATCH 3/3] Fix integration ID handling and delete retry semantics --- exasol/saas/client/api_access.py | 8 ++++---- test/integration/conftest.py | 3 +-- test/integration/test_databases.py | 25 ++++++++++++++++--------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/exasol/saas/client/api_access.py b/exasol/saas/client/api_access.py index 760f6c9..4d06e1e 100644 --- a/exasol/saas/client/api_access.py +++ b/exasol/saas/client/api_access.py @@ -372,8 +372,8 @@ def delete_with_retry() -> None: LOG.info("- 
Trying to delete ...") refreshed_database_id = self._try_refresh_database_id(current_database_id) if refreshed_database_id is None: - LOG.info("Database %s is not listed anymore.", current_database_id) - return + LOG.info("- Database %s is not listed yet ...", current_database_id) + raise TryAgain current_database_id = refreshed_database_id resp = delete_database.sync( self._account_id, @@ -392,8 +392,8 @@ def delete_with_retry() -> None: current_database_id ) if refreshed_database_id is None: - LOG.info("Database %s is already deleted.", current_database_id) - return + LOG.info("- Database %s is not listed yet ...", current_database_id) + raise TryAgain current_database_id = refreshed_database_id raise TryAgain if is_retry(resp): diff --git a/test/integration/conftest.py b/test/integration/conftest.py index 272ba1b..d1c0871 100644 --- a/test/integration/conftest.py +++ b/test/integration/conftest.py @@ -53,8 +53,7 @@ def saas_database( @pytest.fixture(scope="session") def operational_saas_database_id(api_access, database_name) -> str: with api_access.database(database_name) as db: - api_access.wait_until_running(db.id) - yield db.id + yield api_access.wait_until_running(db.id) @pytest.fixture(scope="session") diff --git a/test/integration/test_databases.py b/test/integration/test_databases.py index f338799..a126344 100644 --- a/test/integration/test_databases.py +++ b/test/integration/test_databases.py @@ -9,6 +9,7 @@ from exasol.saas.client import PROMISING_STATES from exasol.saas.client.api_access import ( + _get_database_id, timestamp_name, ) from exasol.saas.client.openapi.models.exasol_database import ExasolDatabase @@ -47,9 +48,9 @@ def wait_until_running_too_short(db: ExasolDatabase): interval=timedelta(seconds=10), ) - def get_connection(db: ExasolDatabase): - clusters = api_access.clusters(db.id) - return api_access.get_connection(db.id, clusters[0].id) + def get_connection(database_id: str): + clusters = api_access.clusters(database_id) + return 
api_access.get_connection(database_id, clusters[0].id) with api_access.database(local_name, ignore_delete_failure=True) as db: start = datetime.now() @@ -59,13 +60,19 @@ def get_connection(db: ExasolDatabase): with pytest.raises(RetryError): wait_until_running_too_short(db) - # verify database is listed - assert db.id in api_access.list_database_ids() + # resolve the effective database ID in case the ID returned by + # create_database() is stale due to eventual consistency. + database_id = _get_database_id( + api_access._account_id, # noqa: SLF001 + api_access._client, # noqa: SLF001 + local_name, + ) + assert database_id in api_access.list_database_ids() - con = get_connection(db) + con = get_connection(database_id) assert con.db_username is not None and con.port == 8563 # delete database and verify database is not listed anymore - api_access.delete_database(db.id) - api_access.wait_until_deleted(db.id) - assert db.id not in api_access.list_database_ids() + api_access.delete_database(database_id) + api_access.wait_until_deleted(database_id) + assert database_id not in api_access.list_database_ids()