From 7bc4154771175431e42024d8c7a7d962eb93237a Mon Sep 17 00:00:00 2001 From: pasinskim Date: Wed, 15 Apr 2026 14:56:24 +0200 Subject: [PATCH 1/8] chore(test): poll deviceconnect readiness instead of fixed sleeps The docker-client fixture is function-scoped (since 59f3afc2), so the deviceconnect tests start each run with a freshly-bootstrapped client and race the deviceconnect session handshake. The original tests papered over this with fixed sleeps (e.g. 128s after iptables -D, 10s after restarting deviceconnect) that are both slow and unreliable: the docker-client's network stack varies, TCP RTO compounds the recovery time on the OUTPUT -j DROP rule (no ICMP/RST is sent back), and the management /connect endpoint returns HTTP 404 ("device disconnected") until the device side is back -- which no fixed sleep can guarantee. Poll the actual readiness instead: retry the full open-websocket + startShell + verify-shell sequence via redo.retriable, catching the 404 (websockets.WebSocketException) and recv TimeoutError, until a working shell is established. Tolerate the narrow "shell from a previous attempt is still running" race that can occur if a retry fires after a successful startShell flapped mid-verify (the device's shell limit is 1). For test_in_poor_network_environment specifically, after iptables -D we restart mender-connect on the device. mender-connect's reconnect backoff caps at 30 minutes (connectionmanager/exponentialbackoff.go in mender-connect) and resetBackoff() is only invoked on a successful connection -- so after a 128s outage, compounded by TCP RTO on the DROP rule, the next reconnect attempt can be minutes away, far beyond a reasonable test budget. Killing the process lets the entrypoint's supervise loop respawn it at attempts=0 so it reconnects on its first cycle. 
This is a deliberate test-semantics change: the assertion is now that mender-connect re-establishes its deviceconnect session after a restart following the network heal (which is what a watchdog/operator would do in practice), rather than waiting out the autonomous backoff timer. Mirrors the kill+respawn pattern in test_filetransfer.update_limits(). The same retry pattern is applied to test_websocket_reconnect, which suffered an identical 404 race after restarting mender-deviceconnect. Related to QA-1527, QA-1563 and QA-1591. Changelog: Poll the deviceconnect /connect endpoint for shell readiness in the remote-terminal tests, and restart mender-connect after the network-heal step in test_in_poor_network_environment to avoid its 30-minute auto-recovery backoff cap. Signed-off-by: pasinskim --- tests/tests/test_mender_connect.py | 96 +++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 21 deletions(-) diff --git a/tests/tests/test_mender_connect.py b/tests/tests/test_mender_connect.py index 329afb8dc..4948d2a95 100644 --- a/tests/tests/test_mender_connect.py +++ b/tests/tests/test_mender_connect.py @@ -18,6 +18,8 @@ import uuid from flaky import flaky +from redo import retriable +from websockets.exceptions import WebSocketException from testutils.api import proto_shell, protomsg from testutils.infra.cli import CliTenantadm @@ -165,11 +167,23 @@ def test_websocket_reconnect(self, docker_env): # Test that mender-connect recovers if it loses the connection to deviceconnect. docker_env.restart_service("mender-deviceconnect") - time.sleep(10) + # mender-connect needs time to re-establish its session after + # deviceconnect restarts; until it does, the mgmt /connect endpoint + # returns HTTP 404 ("device disconnected"). Poll instead of a fixed + # sleep that races the reconnect. 
(QA-1527) + @retriable( + attempts=24, + sleeptime=5, + sleepscale=1, + jitter=0, + retry_exceptions=(WebSocketException,), + ) + def assert_websocket_connects(): + with docker_env.devconnect.get_websocket(): + # Connecting successfully is enough. + pass - with docker_env.devconnect.get_websocket(): - # Nothing to do, just connecting successfully is enough. - pass + assert_websocket_connects() def test_bogus_shell_message(self, docker_env): self.assert_env(docker_env) @@ -212,14 +226,50 @@ def detect_shell_prompt(shell): "$ ", ], "Could not detect shell prompt." - with docker_env.devconnect.get_websocket() as ws: - shell = proto_shell.ProtoShell(ws) - body = shell.startShell() - assert shell.protomsg.props["status"] == protomsg.PROP_STATUS_NORMAL - assert body == proto_shell.MSG_BODY_SHELL_STARTED - - detect_shell_prompt(shell) - is_shell_working(shell) + # Poll the full open-and-start-shell sequence until it succeeds. After a + # network outage the device reconnects to deviceconnect only after + # connection backoff and the session can flap, so neither a fixed sleep + # nor polling get_websocket() (which only checks the management side) + # is reliable: + # - the mgmt /connect endpoint returns HTTP 404 ("device + # disconnected") until the device side is back, and + # - a just-reconnected session may not answer startShell yet + # (recv times out). + # Retrying the whole sequence is the only signal that proves a working + # shell. 48 * 5s = 240s covers the worst-case recovery after the 128s + # drop below; retriable re-raises the last error if it never recovers. + # AssertionErrors are not retried, so a genuine protocol failure still + # fails fast. 
(QA-1527) + @retriable( + attempts=48, + sleeptime=5, + sleepscale=1, + jitter=0, + retry_exceptions=(WebSocketException, TimeoutError), + ) + def assert_working_shell(): + with docker_env.devconnect.get_websocket() as ws: + shell = proto_shell.ProtoShell(ws) + body = shell.startShell() + if ( + shell.protomsg.props["status"] != protomsg.PROP_STATUS_NORMAL + and body + and b"already running" in body + ): + # A shell started by a previous attempt that flapped mid-use + # may not be reaped yet (the shell limit is 1 per device). + # Treat it as not-ready and let the retry wait for the device + # to release it instead of failing on the assert below. + raise TimeoutError( + "shell from a previous attempt is still running" + ) + assert shell.protomsg.props["status"] == protomsg.PROP_STATUS_NORMAL + assert body == proto_shell.MSG_BODY_SHELL_STARTED + + detect_shell_prompt(shell) + is_shell_working(shell) + + assert_working_shell() docker_env.device.run("apt-get update") docker_env.device.run("apt-get install -y iptables") @@ -236,17 +286,21 @@ def detect_shell_prompt(shell): # Re-enable a good connection docker_env.device.run("iptables -D OUTPUT 1") - time.sleep(128) - # mender-connect should have "healed" now and be able to start a new shell - with docker_env.devconnect.get_websocket() as ws: - shell = proto_shell.ProtoShell(ws) - body = shell.startShell() - assert shell.protomsg.props["status"] == protomsg.PROP_STATUS_NORMAL - assert body == proto_shell.MSG_BODY_SHELL_STARTED + # mender-connect's reconnect backoff escalates per-attempt and caps at + # 30 minutes (see connectionmanager/exponentialbackoff.go in + # mender-connect); after a 128s outage the backoff timer can leave the + # next reconnect attempt minutes away, which is not testable in a CI + # window. Restart mender-connect so the fresh process starts at + # attempts=0 and reconnects on its first cycle. 
The entrypoint's + # supervise loop respawns it; this mirrors the canonical pattern used + # by test_filetransfer.update_limits(). (QA-1527, QA-1591) + docker_env.device.run( + "kill -TERM `pidof mender-connect` 2>/dev/null || true" + ) - detect_shell_prompt(shell) - is_shell_working(shell) + # Poll until a working shell can be opened end-to-end. + assert_working_shell() @flaky(max_runs=3) def test_session_recording(self, docker_env): From c61fe6b2e79aa00e047585e0867a5fd9f25c7629 Mon Sep 17 00:00:00 2001 From: pasinskim Date: Thu, 16 Apr 2026 20:30:09 +0200 Subject: [PATCH 2/8] chore(test): fix test_upload_limits_preserve_owner_and_group Remove the non-existent PreserveGroup config key. Mender-connect's PreserveOwner already covers both owner and group via os.Chown(uid, gid). The unknown JSON field was silently ignored. Also move the status code assertion before the file check so that upload failures (e.g. 408 timeout) are diagnosed directly instead of producing a confusing "assert '' == '101 102'" error. 
Signed-off-by: pasinskim --- tests/tests/test_filetransfer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/tests/test_filetransfer.py b/tests/tests/test_filetransfer.py index 17221fc4b..7435c3e91 100644 --- a/tests/tests/test_filetransfer.py +++ b/tests/tests/test_filetransfer.py @@ -465,8 +465,8 @@ def test_upload_limits_preserve_owner_and_group(self, mender_device_setup): "FileTransfer": { "Chroot": "/var/lib/mender/filetransfer", "FollowSymLinks": True, # in the image /var/lib/mender is a symlink + # PreserveOwner covers both owner and group via os.Chown(uid, gid) "PreserveOwner": True, - "PreserveGroup": True, }, }, self.auth, @@ -490,12 +490,13 @@ def test_upload_limits_preserve_owner_and_group(self, mender_device_setup): gid=str(gid), ) + assert r.status_code == 201, r.json() + owner_group = self.mender_device.run( f"ls -aln /var/lib/mender/filetransfer/{fname}.bin | cut -f 3,4 -d' '" ) assert owner_group == str(uid) + " " + str(gid) + "\n" - assert r.status_code == 201 def assert_forbidden(self, rsp, message): try: From ecf6a5f6b4c125e140aee8e778f80bd8d24693b0 Mon Sep 17 00:00:00 2001 From: pasinskim Date: Thu, 16 Apr 2026 22:16:03 +0200 Subject: [PATCH 3/8] chore(test): cap SSH exponential backoff and remove 60s penalty The _run() SSH retry loop had three problems: - Slept before the first attempt (1s wasted if already connected) - Exponential backoff with no cap (1,2,4,8,16,32,64,128s...) - Extra time.sleep(60) on ConnectionError on top of the backoff After 7 failures the gap between retries was 128s; with the 60s penalty a single transient SSH drop could stall for minutes. Fix: try first then sleep, cap backoff at 30s, remove the redundant 60s ConnectionError penalty. Retry sequence is now 1,2,4,8,16,30,30... 
Signed-off-by: pasinskim --- testutils/infra/device.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/testutils/infra/device.py b/testutils/infra/device.py index 99fd4a32a..3d1bbe64f 100644 --- a/testutils/infra/device.py +++ b/testutils/infra/device.py @@ -427,19 +427,13 @@ def _run(conn, cmd, **kw): result = None start_time = time.time() sleeptime = 1 + max_sleeptime = 30 # cap to avoid multi-minute gaps between retries while time.time() < start_time + wait: - # Back off exponentially to save SSH handshakes in QEMU, which - # are quite expensive. - time.sleep(sleeptime) - sleeptime *= 2 - try: result = conn.run(cmd, **kw) break except ConnectionError as e: logger.info(f"Got SSH exception while connecting to host {conn.host}: {e}") - time.sleep(60) - continue except OSError as e: # The OSError is happening while there is no QEMU instance initialized logger.info( @@ -449,12 +443,17 @@ def _run(conn, cmd, **kw): ) if "Cannot assign requested address" not in str(e): raise e - continue except Exception as e: logger.exception( f"Generic exception happened while connecting to host {conn.host}" ) raise e + + # Back off exponentially to save SSH handshakes in QEMU, which + # are quite expensive. Capped so we never wait longer than + # max_sleeptime between attempts. + time.sleep(sleeptime) + sleeptime = min(sleeptime * 2, max_sleeptime) else: raise RuntimeError( f"Could not successfully run command after {wait} seconds on host {conn.host}: {cmd}" From 437738c18e24cc4b119bcceeed656c4d42842721 Mon Sep 17 00:00:00 2001 From: pasinskim Date: Thu, 16 Apr 2026 22:17:26 +0200 Subject: [PATCH 4/8] chore(test): fix deployment polling and remove unnecessary sleeps Three issues in deployments.py: 1. check_expected_statistics: slept 200ms before the first check on every iteration. With 59 call sites this adds up. 2. 
check_not_in_status: had a for/else logic bug where 'continue' targeted the inner for-loop instead of the outer while-loop, making the function silently return success in some edge cases. Replaced with straightforward any() check. 3. abort() and abort_finished_deployment(): both had a blind sleep(5) between the PUT response and the assertion on that same response's status code. This was completely pointless since r.status_code is already set. Saves 5s per call (4 call sites = 20s). Signed-off-by: pasinskim --- tests/MenderAPI/deployments.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/tests/MenderAPI/deployments.py b/tests/MenderAPI/deployments.py index 6902e5a28..f099bc8b1 100644 --- a/tests/MenderAPI/deployments.py +++ b/tests/MenderAPI/deployments.py @@ -210,16 +210,14 @@ def check_not_in_status( while time.time() <= timeout: data = self.get_status(status=expected_status) - for deployment in data: - if deployment["id"] == deployment_id: - time.sleep(polling_frequency) - continue - else: + found = any(d["id"] == deployment_id for d in data) + if not found: logger.info( "left deployment status (%s) as expected for: %s" % (expected_status, deployment_id) ) return + time.sleep(polling_frequency) pytest.fail( "Never left status: %s for %s after %d seconds" @@ -238,8 +236,6 @@ def check_expected_statistics( seen = set() while time.time() <= timeout: - time.sleep(polling_frequency) - data = self.get_statistics(deployment_id) seen.add(str(data)) @@ -260,13 +256,12 @@ def check_expected_statistics( if data[expected_status] == expected_count: return - continue + time.sleep(polling_frequency) - if time.time() > timeout: - pytest.fail( - "Never found: %s:%s, only seen: %s after %d seconds" - % (expected_status, expected_count, str(seen), max_wait) - ) + pytest.fail( + "Never found: %s:%s, only seen: %s after %d seconds" + % (expected_status, expected_count, str(seen), max_wait) + ) def get_deployment_overview(self, 
deployment_id): deployments_overview_url = ( @@ -326,7 +321,6 @@ def abort(self, deployment_id): headers=self.auth.get_auth_token(), json={"status": "aborted"}, ) - time.sleep(5) assert r.status_code == requests.status_codes.codes.no_content def abort_finished_deployment(self, deployment_id): @@ -339,7 +333,6 @@ def abort_finished_deployment(self, deployment_id): headers=self.auth.get_auth_token(), json={"status": "aborted"}, ) - time.sleep(5) assert r.status_code == requests.status_codes.codes.unprocessable_entity def patch_deployment(self, deployment_id, update_control_map): From 30982269536ba112ed7beef133e5b9143f6b6a9c Mon Sep 17 00:00:00 2001 From: pasinskim Date: Thu, 16 Apr 2026 22:17:47 +0200 Subject: [PATCH 5/8] chore(test): faster log polling and retry on 503/504 _check_log_for_message: used a fixed 10s polling interval with sleep-before-check. If the log message appeared within 1s, the test still waited 10s. Replace with check-first and exponential backoff (2,4,8,10,10...) saving ~8s per call across 7 call sites. requests_retry: only retried on 500/502 but not 503 (Service Unavailable) or 504 (Gateway Timeout), which are common during container startup when the gateway is up but backends are still loading. Adding these prevents false test failures during setup. 
Signed-off-by: pasinskim --- tests/MenderAPI/requests_helpers.py | 4 ++-- tests/helpers.py | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/MenderAPI/requests_helpers.py b/tests/MenderAPI/requests_helpers.py index 1af167546..9829d5992 100644 --- a/tests/MenderAPI/requests_helpers.py +++ b/tests/MenderAPI/requests_helpers.py @@ -17,8 +17,8 @@ from requests.packages.urllib3.util.retry import Retry -# Will retry on 500 Server error -def requests_retry(status_forcelist=[500, 502]): +# Will retry on server errors (5xx) +def requests_retry(status_forcelist=[500, 502, 503, 504]): s = requests.Session() retries = Retry( total=5, diff --git a/tests/helpers.py b/tests/helpers.py index 6646da360..c073bd4c0 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -16,6 +16,8 @@ import json import time +import pytest + from .MenderAPI import devauth logger = logging.getLogger() @@ -68,9 +70,11 @@ def _check_log_for_message(device, message, since=None): # entries since the last service restart. 
cmd = f"systemctl status --no-pager --full --lines 100000 mender-authd" - sleepsec = 0 + sleeptime = 2 + max_sleeptime = 10 timeout = 600 - while sleepsec < timeout: + deadline = time.time() + timeout + while time.time() < deadline: out = device.run( cmd + "| grep '" + message + "'", warn=True, @@ -78,15 +82,16 @@ def _check_log_for_message(device, message, since=None): if out != "": return - time.sleep(10) - sleepsec += 10 + waited = int(timeout - (deadline - time.time())) logger.info( - f"waiting for message '{message}' in mender-authd log, waited for: {sleepsec}" + f"waiting for message '{message}' in mender-authd log, waited for: {waited}s" ) + time.sleep(sleeptime) + sleeptime = min(sleeptime * 2, max_sleeptime) - assert ( - sleepsec <= timeout - ), f"timeout for waiting for message '{message}' in mender-authd log" + pytest.fail( + f"timeout ({timeout}s) waiting for message '{message}' in mender-authd log" + ) @staticmethod def check_log_is_authenticated(device, since=None): From 6125b935c3933475c96f2eee202165d7838b4732 Mon Sep 17 00:00:00 2001 From: pasinskim Date: Fri, 17 Apr 2026 08:39:04 +0200 Subject: [PATCH 6/8] chore: reuse Docker Compose stacks across filetransfer tests Convert all 4 filetransfer test classes from function-scoped to class-scoped fixtures so each class spins up the ~21-container Docker Compose stack once instead of per-test. This saves ~80% of the wall time for the filetransfer suite (verified: 6 tests in 97s vs ~580s). Follows the existing _impl + class_persistent_ pattern already used for QEMU client fixtures. 
Signed-off-by: pasinskim --- tests/common_setup.py | 26 ++++++++++++++++++++++---- tests/tests/test_filetransfer.py | 30 ++++++++++++++++++------------ 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/tests/common_setup.py b/tests/common_setup.py index fff4e2159..9afa93d95 100644 --- a/tests/common_setup.py +++ b/tests/common_setup.py @@ -115,8 +115,7 @@ def standard_setup_one_rofs_client_bootstrapped(request): return env -@pytest.fixture(scope="function") -def standard_setup_one_docker_client_bootstrapped(request): +def standard_setup_one_docker_client_bootstrapped_impl(request): env = container_factory.get_docker_client_setup() request.addfinalizer(env.teardown) @@ -132,6 +131,16 @@ def standard_setup_one_docker_client_bootstrapped(request): return env +@pytest.fixture(scope="function") +def standard_setup_one_docker_client_bootstrapped(request): + return standard_setup_one_docker_client_bootstrapped_impl(request) + + +@pytest.fixture(scope="class") +def class_persistent_standard_setup_one_docker_client_bootstrapped(request): + return standard_setup_one_docker_client_bootstrapped_impl(request) + + @pytest.fixture(scope="function") def standard_setup_two_clients_bootstrapped(request): env = container_factory.get_standard_setup(num_clients=2) @@ -356,8 +365,7 @@ def enterprise_two_clients_bootstrapped(request): return env -@pytest.fixture(scope="function") -def enterprise_one_docker_client_bootstrapped(request): +def enterprise_one_docker_client_bootstrapped_impl(request): env = container_factory.get_enterprise_docker_client_setup(num_clients=0) request.addfinalizer(env.teardown) @@ -376,6 +384,16 @@ def enterprise_one_docker_client_bootstrapped(request): return env +@pytest.fixture(scope="function") +def enterprise_one_docker_client_bootstrapped(request): + return enterprise_one_docker_client_bootstrapped_impl(request) + + +@pytest.fixture(scope="class") +def class_persistent_enterprise_one_docker_client_bootstrapped(request): + return 
enterprise_one_docker_client_bootstrapped_impl(request) + + @pytest.fixture(scope="function") def enterprise_one_rofs_client_bootstrapped(request): env = container_factory.get_enterprise_rofs_client_setup(num_clients=0) diff --git a/tests/tests/test_filetransfer.py b/tests/tests/test_filetransfer.py index 7435c3e91..b960a860f 100644 --- a/tests/tests/test_filetransfer.py +++ b/tests/tests/test_filetransfer.py @@ -33,6 +33,8 @@ from ..common_setup import ( enterprise_one_docker_client_bootstrapped, standard_setup_one_docker_client_bootstrapped, + class_persistent_enterprise_one_docker_client_bootstrapped, + class_persistent_standard_setup_one_docker_client_bootstrapped, ) from ..MenderAPI import ( @@ -700,11 +702,11 @@ def rerun_on_timeouts(err, *args): class TestFileTransferDownloadOS(BaseTestFileTransferDownload): """Tests the file transfer functionality""" - @pytest.fixture(scope="function") + @pytest.fixture(scope="class") def mender_device_setup( - self, request, standard_setup_one_docker_client_bootstrapped + self, request, class_persistent_standard_setup_one_docker_client_bootstrapped ): - env = standard_setup_one_docker_client_bootstrapped + env = class_persistent_standard_setup_one_docker_client_bootstrapped request.cls.auth = env.auth request.cls.mender_device = env.device @@ -725,9 +727,11 @@ def test_download_ok(self, mender_device_setup, content_assertion=None): class TestFileTransferDownloadEnterprise(BaseTestFileTransferDownload): """Tests the file transfer functionality for enterprise setup""" - @pytest.fixture(scope="function") - def mender_device_setup(self, request, enterprise_one_docker_client_bootstrapped): - env = enterprise_one_docker_client_bootstrapped + @pytest.fixture(scope="class") + def mender_device_setup( + self, request, class_persistent_enterprise_one_docker_client_bootstrapped + ): + env = class_persistent_enterprise_one_docker_client_bootstrapped devid, auth_token, auth, mender_device = prepare_env_for_connect( env, docker=True, @@ 
-744,12 +748,12 @@ def mender_device_setup(self, request, enterprise_one_docker_client_bootstrapped class TestFileTransferLimitsOS(BaseTestFileTransferLimits): """Tests the file transfer functionality""" - @pytest.fixture(scope="function") + @pytest.fixture(scope="class") def mender_device_setup( - self, request, standard_setup_one_docker_client_bootstrapped + self, request, class_persistent_standard_setup_one_docker_client_bootstrapped ): - env = standard_setup_one_docker_client_bootstrapped + env = class_persistent_standard_setup_one_docker_client_bootstrapped request.cls.auth = env.auth request.cls.mender_device = env.device @@ -767,9 +771,11 @@ def mender_device_setup( class TestFileTransferLimitsEnterprise(BaseTestFileTransferLimits): """Tests the file transfer functionality for enterprise setup""" - @pytest.fixture(scope="function") - def mender_device_setup(self, request, enterprise_one_docker_client_bootstrapped): - env = enterprise_one_docker_client_bootstrapped + @pytest.fixture(scope="class") + def mender_device_setup( + self, request, class_persistent_enterprise_one_docker_client_bootstrapped + ): + env = class_persistent_enterprise_one_docker_client_bootstrapped devid, auth_token, auth, mender_device = prepare_env_for_connect( env, docker=True, From cad2adc82cb6ce6c0f8ec310a1a4acb6d4f1fdbd Mon Sep 17 00:00:00 2001 From: pasinskim Date: Fri, 17 Apr 2026 12:20:29 +0200 Subject: [PATCH 7/8] chore(test): remove obsolete Mender v1.7.0 migration tests Remove test_db_migration.py and its v1 legacy client fixtures from common_setup.py. These tests verified upgrading from Mender v1.7.0 (released 2017) to the current version. This migration path is no longer supported. The fixtures were only used by this test file. 
Signed-off-by: pasinskim --- tests/common_setup.py | 44 ----- tests/tests/test_db_migration.py | 272 ------------------------------- 2 files changed, 316 deletions(-) delete mode 100644 tests/tests/test_db_migration.py diff --git a/tests/common_setup.py b/tests/common_setup.py index 9afa93d95..ac683766f 100644 --- a/tests/common_setup.py +++ b/tests/common_setup.py @@ -169,30 +169,6 @@ def standard_setup_without_client(request): return env -@pytest.fixture(scope="function") -def setup_with_legacy_v1_client(request): - # The legacy 1.7.0 client was only built for qemux86-64, so skip tests using - # it when running other platforms. - if conftest.machine_name != "qemux86-64": - pytest.skip( - "Test only works with qemux86-64, and this is %s" % conftest.machine_name - ) - - env = container_factory.get_legacy_v1_client_setup() - request.addfinalizer(env.teardown) - - env.setup() - - env.device = MenderDevice(env.get_mender_clients()[0]) - env.device.ssh_is_opened() - - reset_mender_api(env) - devauth.accept_devices(1) - - env.auth = auth - return env - - @pytest.fixture(scope="function") def setup_with_legacy_v3_client(request): env = container_factory.get_legacy_v3_client_setup() @@ -496,26 +472,6 @@ def enterprise_with_short_lived_token(request): return env -@pytest.fixture(scope="function") -def enterprise_with_legacy_v1_client(request): - env = container_factory.get_enterprise_legacy_v1_client_setup(num_clients=0) - request.addfinalizer(env.teardown) - - env.setup() - reset_mender_api(env) - - tenant = create_tenant(env) - new_tenant_client(env, "mender-client", tenant["tenant_token"]) - env.device_group.ssh_is_opened() - - devauth_tenant = DeviceAuthV2(env.auth) - devauth_tenant.accept_devices(1) - devices = devauth_tenant.get_devices_status("accepted") - assert 1 == len(devices) - - return env - - @pytest.fixture(scope="function") def enterprise_with_legacy_v3_client(request): env = container_factory.get_enterprise_legacy_v3_client_setup(num_clients=0) diff --git 
a/tests/tests/test_db_migration.py b/tests/tests/test_db_migration.py deleted file mode 100644 index 004a4c14b..000000000 --- a/tests/tests/test_db_migration.py +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright 2022 Northern.tech AS -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import pytest -import shutil -import tempfile - -from ..common_setup import ( - setup_with_legacy_v1_client, - enterprise_with_legacy_v1_client, -) -from .common_update import update_image, common_update_procedure -from ..MenderAPI import DeviceAuthV2, Deployments, logger -from .mendertesting import MenderTesting - - -class BaseTestDBMigration(MenderTesting): - def ensure_persistent_conf_script(self, dir): - # Because older versions of Yocto branches did not split mender.conf - # into /etc/mender/mender.conf and /data/mender/mender.conf, we need to - # provide the content of the second file ourselves. - name = os.path.join(dir, "ArtifactInstall_Enter_00_ensure_persistent_conf") - with open(name, "w") as fd: - fd.write("""#!/bin/sh - -set -e - -if ! 
[ -f /data/mender/mender.conf ]; then - ( - echo '{' - grep RootfsPart /etc/mender/mender.conf |sed -e '${s/,$//}' - echo '}' - ) > /data/mender/mender.conf -fi -exit 0 -""") - return name - - def generate_storage_device_state_scripts(self, dir): - # Older versions of our mender-client-qemu image had /dev/hda as their - # storage, in kirkstone, this switched to /dev/sda, so we need to make - # this conversion both when upgrading, and rolling back. - - content = """#!/bin/sh - -detect_image_type_on_passive() { - # Sanity check that this is a Poky build. - if ! grep Poky "$1/etc/os-release" > /dev/null; then - echo "This test is not adapted to non-Poky builds!" 1>&2 - exit 127 - fi - - eval "$(grep '^VERSION_ID=' "$1/etc/os-release")" - printf '%s\n%s\n' "$VERSION_ID" 3.5 > /tmp/versions.txt - # If the smallest is 3.5, it means VERSION_ID is higher or equal, which - # means kirkstone or higher. - if [ "$(sort -V /tmp/versions.txt | head -n 1)" = "3.5" ]; then - echo "/dev/sda" - else - echo "/dev/hda" - fi -} - -if mount | grep "2 on / "; then - eval $(printf PASSIVE=%s /dev/[hs]da3) -else - eval $(printf PASSIVE=%s /dev/[hs]da2) -fi - -mount "$PASSIVE" /mnt -DEV="$(detect_image_type_on_passive /mnt)" -umount /mnt - -for file in /data/mender/mender.conf $(find /boot/efi/ -name grub.cfg); do - sed -i -e "s,/dev/[hs]da,$DEV,g" "$file" -done -""" - - scripts = [ - os.path.join(dir, "ArtifactInstall_Leave_10_storage_device"), - os.path.join(dir, "ArtifactRollback_Leave_10_storage_device"), - ] - for script in scripts: - with open(script, "w") as fd: - fd.write(content) - return scripts - - def do_test_migrate_from_legacy_mender_v1_failure( - self, env, valid_image_with_mender_conf - ): - """ - Start a legacy client (1.7.0) first and update it to the new one. - - The test starts a setup with the 1.7.0 client and then updates it to - the current version. The update is failing first (due to failure - returned inside the artifact commit enter state script). 
- After the failed first update we are updating cient (1.7.0) again, - and this time the update should succeed. - """ - - mender_device = env.device - devauth = DeviceAuthV2(env.auth) - deploy = Deployments(env.auth, devauth) - - dirpath = tempfile.mkdtemp() - script_content = "#!/bin/sh\nexit 1\n" - with open(os.path.join(dirpath, "ArtifactCommit_Enter_01"), "w") as fd: - fd.write(script_content) - - active_part = mender_device.get_active_partition() - - ensure_persistent_conf = self.ensure_persistent_conf_script(dirpath) - storage_device_state_scripts = self.generate_storage_device_state_scripts( - dirpath - ) - - mender_conf = mender_device.run("cat /etc/mender/mender.conf") - mender_conf_json = json.loads(mender_conf) - # Delete these, we want the persistent_conf above to take effect. - del mender_conf_json["RootfsPartA"] - del mender_conf_json["RootfsPartB"] - valid_image = valid_image_with_mender_conf(json.dumps(mender_conf_json)) - - # first start with the failed update - host_ip = env.get_virtual_network_host_ip() - with mender_device.get_reboot_detector(host_ip) as reboot: - deployment_id, _ = common_update_procedure( - valid_image, - scripts=[ - ensure_persistent_conf, - os.path.join(dirpath, "ArtifactCommit_Enter_01"), - ] - + storage_device_state_scripts, - version=2, - devauth=devauth, - deploy=deploy, - ) - - logger.info("waiting for system to reboot twice") - reboot.verify_reboot_performed(number_of_reboots=2) - - assert mender_device.get_active_partition() == active_part - deploy.check_expected_statistics(deployment_id, "failure", 1) - - # do the next update, this time successful - update_image( - mender_device, - host_ip, - scripts=[ensure_persistent_conf] + storage_device_state_scripts, - install_image=valid_image, - version=2, - devauth=devauth, - deploy=deploy, - ) - - def do_test_migrate_from_legacy_mender_v1_success( - self, env, valid_image_with_mender_conf - ): - """ - Start a legacy client (1.7.0) first and update it to the new one. 
- - The test starts a setup with the 1.7.0 client and then updates it to - the current version. After the first successful update, we are updating - the client for the second time, to make sure the DB migration has not left - any traces in the database that are causing issues. - """ - - mender_device = env.device - devauth = DeviceAuthV2(env.auth) - deploy = Deployments(env.auth, devauth) - - tmpdir = tempfile.mkdtemp() - test_log = "/var/lib/mender/migration_state_scripts.log" - try: - ensure_persistent_conf = self.ensure_persistent_conf_script(tmpdir) - storage_device_state_scripts = self.generate_storage_device_state_scripts( - tmpdir - ) - - # Test that state scripts are also executed correctly. - scripts = ["ArtifactInstall_Enter_00", "ArtifactCommit_Enter_00"] - scripts_paths = [] - for script in scripts: - script_path = os.path.join(tmpdir, script) - scripts_paths += [script_path] - with open(script_path, "w") as fd: - fd.write("#!/bin/sh\necho $(basename $0) >> %s\n" % test_log) - - mender_conf = mender_device.run("cat /etc/mender/mender.conf") - mender_conf_json = json.loads(mender_conf) - # Delete these, we want the persistent_conf above to take effect. - del mender_conf_json["RootfsPartA"] - del mender_conf_json["RootfsPartB"] - valid_image = valid_image_with_mender_conf(json.dumps(mender_conf_json)) - - # do the successful update twice - host_ip = env.get_virtual_network_host_ip() - update_image( - mender_device, - host_ip, - install_image=valid_image, - scripts=[ensure_persistent_conf] - + storage_device_state_scripts - + scripts_paths, - version=2, - devauth=devauth, - deploy=deploy, - ) - assert mender_device.run("cat %s" % test_log).strip() == "\n".join(scripts) - - # NOTE: With client >= 4.x we only support Artifact version 3 - update_image( - mender_device, - host_ip, - install_image=valid_image, - # Second update should not need storage_device_state_scripts. 
- scripts=[ensure_persistent_conf] + scripts_paths, - version=3, - devauth=devauth, - deploy=deploy, - ) - assert mender_device.run("cat %s" % test_log).strip() == "\n".join( - scripts - ) + "\n" + "\n".join(scripts) - - finally: - shutil.rmtree(tmpdir) - - -class TestDBMigrationOpenSource(BaseTestDBMigration): - def test_migrate_from_legacy_mender_v1_failure( - self, setup_with_legacy_v1_client, valid_image_with_mender_conf - ): - self.do_test_migrate_from_legacy_mender_v1_failure( - setup_with_legacy_v1_client, valid_image_with_mender_conf - ) - - def test_migrate_from_legacy_mender_v1_success( - self, setup_with_legacy_v1_client, valid_image_with_mender_conf - ): - self.do_test_migrate_from_legacy_mender_v1_success( - setup_with_legacy_v1_client, valid_image_with_mender_conf - ) - - -class TestDBMigrationEnterprise(BaseTestDBMigration): - def test_migrate_from_legacy_mender_v1_failure( - self, enterprise_with_legacy_v1_client, valid_image_with_mender_conf - ): - self.do_test_migrate_from_legacy_mender_v1_failure( - enterprise_with_legacy_v1_client, valid_image_with_mender_conf - ) - - def test_migrate_from_legacy_mender_v1_success( - self, enterprise_with_legacy_v1_client, valid_image_with_mender_conf - ): - self.do_test_migrate_from_legacy_mender_v1_success( - enterprise_with_legacy_v1_client, valid_image_with_mender_conf - ) From 13da8cd900c0c6630b3f7b317963c9ef3857cbec Mon Sep 17 00:00:00 2001 From: pasinskim Date: Wed, 27 May 2026 22:58:03 +0200 Subject: [PATCH 8/8] chore(test): poll for alert store expiry instead of fixed sleeps test_monitorclient_remove_old_alerts read the alert store once at a fixed offset after starting mender-monitor and asserted an exact key count. mender-monitor purges expired records on a periodic loop with variable restart latency, so the single timed read raced the purge cycle. 
The faster mender_device.run() from the SSH backoff change (no longer sleeping 1s before every healthy command) shifted the read ~2s earlier and collapsed the timing margin, surfacing the latent flakiness (seen as '4' == '2' and '1' == '0'). Poll until the store drains to the expected count instead. It drains monotonically (4 -> 2 -> 0), so 1s polling reliably observes each plateau regardless of purge-loop latency. Signed-off-by: pasinskim --- tests/tests/test_monitor_client.py | 55 ++++++++++++++++++------------ 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/tests/tests/test_monitor_client.py b/tests/tests/test_monitor_client.py index 6c7905499..6a2ff6d4e 100644 --- a/tests/tests/test_monitor_client.py +++ b/tests/tests/test_monitor_client.py @@ -1593,30 +1593,41 @@ def test_monitorclient_remove_old_alerts(self, monitor_commercial_setup_no_clien ) mender_device.run("systemctl start mender-monitor") - # T8: mender-monitor started - time.sleep(alert_resend_interval_s) - time.sleep(alert_resend_interval_s) - - # Shift by 1s to avoid race condition when checking - time.sleep(1) + def count_alert_keys(): + output = mender_device.run( + "bash -c 'cd /usr/share/mender-monitor && . lib/fixlenstore-lib.sh;" + + "keys_nolock | wc -l;'" + ) + return int(output.strip()) + + def wait_for_alert_count(expected, max_wait_s): + # mender-monitor purges expired records on a periodic loop + # (DEFAULT_ALERT_STORE_RESEND_INTERVAL_S) whose restart latency + # varies, so poll for the expected count instead of reading once at a + # fixed offset, which races the purge cycle. The store drains + # monotonically (4 -> 2 -> 0), so 1s polling reliably observes each + # plateau. 
(QA-1527) + deadline = time.monotonic() + max_wait_s + count = None + while time.monotonic() < deadline: + count = count_alert_keys() + logger.info( + "test_monitorclient_remove_old_alerts: %d keys in store" % count + ) + if count == expected: + return + time.sleep(1) + assert count == expected, "expected %d alert keys in store, got %s" % ( + expected, + count, + ) - # T16+1: key1, key2 expired - output = mender_device.run( - "bash -c 'cd /usr/share/mender-monitor && . lib/fixlenstore-lib.sh;" - + "keys_nolock | wc -l;'" - ) - logger.info("test_monitorclient_remove_old_alerts got %s keys" % output) - assert output == "2\n" + # key1, key2 expire at alert_max_age, ~8s before key3, key4 (inserted 8s + # later), leaving a wide window where exactly 2 keys remain. + wait_for_alert_count(2, max_wait_s=2 * alert_max_age) - time.sleep(alert_resend_interval_s) - time.sleep(alert_resend_interval_s) - # T24+1: key3, key4 expired - output = mender_device.run( - "bash -c 'cd /usr/share/mender-monitor && . lib/fixlenstore-lib.sh;" - + "keys_nolock | wc -l;'" - ) - logger.info("test_monitorclient_remove_old_alerts got %s keys" % output) - assert output == "0\n" + # key3, key4 expire ~8s after key1, key2, draining the store completely. + wait_for_alert_count(0, max_wait_s=2 * alert_max_age) mender_device.run( "mv /usr/share/mender-monitor/config/config.sh.backup /usr/share/mender-monitor/config/config.sh"