diff --git a/tests/MenderAPI/deployments.py b/tests/MenderAPI/deployments.py index 6902e5a28..f099bc8b1 100644 --- a/tests/MenderAPI/deployments.py +++ b/tests/MenderAPI/deployments.py @@ -210,16 +210,14 @@ def check_not_in_status( while time.time() <= timeout: data = self.get_status(status=expected_status) - for deployment in data: - if deployment["id"] == deployment_id: - time.sleep(polling_frequency) - continue - else: + found = any(d["id"] == deployment_id for d in data) + if not found: logger.info( "left deployment status (%s) as expected for: %s" % (expected_status, deployment_id) ) return + time.sleep(polling_frequency) pytest.fail( "Never left status: %s for %s after %d seconds" @@ -238,8 +236,6 @@ def check_expected_statistics( seen = set() while time.time() <= timeout: - time.sleep(polling_frequency) - data = self.get_statistics(deployment_id) seen.add(str(data)) @@ -260,13 +256,12 @@ def check_expected_statistics( if data[expected_status] == expected_count: return - continue + time.sleep(polling_frequency) - if time.time() > timeout: - pytest.fail( - "Never found: %s:%s, only seen: %s after %d seconds" - % (expected_status, expected_count, str(seen), max_wait) - ) + pytest.fail( + "Never found: %s:%s, only seen: %s after %d seconds" + % (expected_status, expected_count, str(seen), max_wait) + ) def get_deployment_overview(self, deployment_id): deployments_overview_url = ( @@ -326,7 +321,6 @@ def abort(self, deployment_id): headers=self.auth.get_auth_token(), json={"status": "aborted"}, ) - time.sleep(5) assert r.status_code == requests.status_codes.codes.no_content def abort_finished_deployment(self, deployment_id): @@ -339,7 +333,6 @@ def abort_finished_deployment(self, deployment_id): headers=self.auth.get_auth_token(), json={"status": "aborted"}, ) - time.sleep(5) assert r.status_code == requests.status_codes.codes.unprocessable_entity def patch_deployment(self, deployment_id, update_control_map): diff --git a/tests/MenderAPI/requests_helpers.py b/tests/MenderAPI/requests_helpers.py index 1af167546..9829d5992 100644 --- a/tests/MenderAPI/requests_helpers.py +++ b/tests/MenderAPI/requests_helpers.py @@ -17,8 +17,8 @@ from requests.packages.urllib3.util.retry import Retry -# Will retry on 500 Server error -def requests_retry(status_forcelist=[500, 502]): +# Will retry on server errors (5xx) +def requests_retry(status_forcelist=[500, 502, 503, 504]): s = requests.Session() retries = Retry( total=5, diff --git a/tests/common_setup.py b/tests/common_setup.py index fff4e2159..ac683766f 100644 --- a/tests/common_setup.py +++ b/tests/common_setup.py @@ -115,8 +115,7 @@ def standard_setup_one_rofs_client_bootstrapped(request): return env -@pytest.fixture(scope="function") -def standard_setup_one_docker_client_bootstrapped(request): +def standard_setup_one_docker_client_bootstrapped_impl(request): env = container_factory.get_docker_client_setup() request.addfinalizer(env.teardown) @@ -132,6 +131,16 @@ def standard_setup_one_docker_client_bootstrapped(request): return env +@pytest.fixture(scope="function") +def standard_setup_one_docker_client_bootstrapped(request): + return standard_setup_one_docker_client_bootstrapped_impl(request) + + +@pytest.fixture(scope="class") +def class_persistent_standard_setup_one_docker_client_bootstrapped(request): + return standard_setup_one_docker_client_bootstrapped_impl(request) + + @pytest.fixture(scope="function") def standard_setup_two_clients_bootstrapped(request): env = container_factory.get_standard_setup(num_clients=2) @@ -160,30 +169,6 @@ def standard_setup_without_client(request): return env -@pytest.fixture(scope="function") -def setup_with_legacy_v1_client(request): - # The legacy 1.7.0 client was only built for qemux86-64, so skip tests using - # it when running other platforms. - if conftest.machine_name != "qemux86-64": - pytest.skip( - "Test only works with qemux86-64, and this is %s" % conftest.machine_name - ) - - env = container_factory.get_legacy_v1_client_setup() - request.addfinalizer(env.teardown) - - env.setup() - - env.device = MenderDevice(env.get_mender_clients()[0]) - env.device.ssh_is_opened() - - reset_mender_api(env) - devauth.accept_devices(1) - - env.auth = auth - return env - - @pytest.fixture(scope="function") def setup_with_legacy_v3_client(request): env = container_factory.get_legacy_v3_client_setup() @@ -356,8 +341,7 @@ def enterprise_two_clients_bootstrapped(request): return env -@pytest.fixture(scope="function") -def enterprise_one_docker_client_bootstrapped(request): +def enterprise_one_docker_client_bootstrapped_impl(request): env = container_factory.get_enterprise_docker_client_setup(num_clients=0) request.addfinalizer(env.teardown) @@ -376,6 +360,16 @@ def enterprise_one_docker_client_bootstrapped(request): return env +@pytest.fixture(scope="function") +def enterprise_one_docker_client_bootstrapped(request): + return enterprise_one_docker_client_bootstrapped_impl(request) + + +@pytest.fixture(scope="class") +def class_persistent_enterprise_one_docker_client_bootstrapped(request): + return enterprise_one_docker_client_bootstrapped_impl(request) + + @pytest.fixture(scope="function") def enterprise_one_rofs_client_bootstrapped(request): env = container_factory.get_enterprise_rofs_client_setup(num_clients=0) @@ -478,26 +472,6 @@ def enterprise_with_short_lived_token(request): return env -@pytest.fixture(scope="function") -def enterprise_with_legacy_v1_client(request): - env = container_factory.get_enterprise_legacy_v1_client_setup(num_clients=0) - request.addfinalizer(env.teardown) - - env.setup() - reset_mender_api(env) - - tenant = create_tenant(env) - new_tenant_client(env, "mender-client", tenant["tenant_token"]) - env.device_group.ssh_is_opened() - - devauth_tenant = DeviceAuthV2(env.auth) - devauth_tenant.accept_devices(1) - devices = devauth_tenant.get_devices_status("accepted") - assert 1 == len(devices) - - return env - - @pytest.fixture(scope="function") def enterprise_with_legacy_v3_client(request): env = container_factory.get_enterprise_legacy_v3_client_setup(num_clients=0) diff --git a/tests/helpers.py b/tests/helpers.py index 6646da360..c073bd4c0 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -16,6 +16,8 @@ import json import time +import pytest + from .MenderAPI import devauth logger = logging.getLogger() @@ -68,9 +70,11 @@ def _check_log_for_message(device, message, since=None): # entries since the last service restart. cmd = f"systemctl status --no-pager --full --lines 100000 mender-authd" - sleepsec = 0 + sleeptime = 2 + max_sleeptime = 10 timeout = 600 - while sleepsec < timeout: + deadline = time.time() + timeout + while time.time() < deadline: out = device.run( cmd + "| grep '" + message + "'", warn=True, @@ -78,15 +82,16 @@ def _check_log_for_message(device, message, since=None): if out != "": return - time.sleep(10) - sleepsec += 10 + waited = int(timeout - (deadline - time.time())) logger.info( - f"waiting for message '{message}' in mender-authd log, waited for: {sleepsec}" + f"waiting for message '{message}' in mender-authd log, waited for: {waited}s" ) + time.sleep(sleeptime) + sleeptime = min(sleeptime * 2, max_sleeptime) - assert ( - sleepsec <= timeout - ), f"timeout for waiting for message '{message}' in mender-authd log" + pytest.fail( + f"timeout ({timeout}s) waiting for message '{message}' in mender-authd log" + ) @staticmethod def check_log_is_authenticated(device, since=None): diff --git a/tests/tests/test_db_migration.py b/tests/tests/test_db_migration.py deleted file mode 100644 index 004a4c14b..000000000 --- a/tests/tests/test_db_migration.py +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright 2022 Northern.tech AS -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import pytest -import shutil -import tempfile - -from ..common_setup import ( - setup_with_legacy_v1_client, - enterprise_with_legacy_v1_client, -) -from .common_update import update_image, common_update_procedure -from ..MenderAPI import DeviceAuthV2, Deployments, logger -from .mendertesting import MenderTesting - - -class BaseTestDBMigration(MenderTesting): - def ensure_persistent_conf_script(self, dir): - # Because older versions of Yocto branches did not split mender.conf - # into /etc/mender/mender.conf and /data/mender/mender.conf, we need to - # provide the content of the second file ourselves. - name = os.path.join(dir, "ArtifactInstall_Enter_00_ensure_persistent_conf") - with open(name, "w") as fd: - fd.write("""#!/bin/sh - -set -e - -if ! [ -f /data/mender/mender.conf ]; then - ( - echo '{' - grep RootfsPart /etc/mender/mender.conf |sed -e '${s/,$//}' - echo '}' - ) > /data/mender/mender.conf -fi -exit 0 -""") - return name - - def generate_storage_device_state_scripts(self, dir): - # Older versions of our mender-client-qemu image had /dev/hda as their - # storage, in kirkstone, this switched to /dev/sda, so we need to make - # this conversion both when upgrading, and rolling back. - - content = """#!/bin/sh - -detect_image_type_on_passive() { - # Sanity check that this is a Poky build. - if ! grep Poky "$1/etc/os-release" > /dev/null; then - echo "This test is not adapted to non-Poky builds!" 1>&2 - exit 127 - fi - - eval "$(grep '^VERSION_ID=' "$1/etc/os-release")" - printf '%s\n%s\n' "$VERSION_ID" 3.5 > /tmp/versions.txt - # If the smallest is 3.5, it means VERSION_ID is higher or equal, which - # means kirkstone or higher. - if [ "$(sort -V /tmp/versions.txt | head -n 1)" = "3.5" ]; then - echo "/dev/sda" - else - echo "/dev/hda" - fi -} - -if mount | grep "2 on / "; then - eval $(printf PASSIVE=%s /dev/[hs]da3) -else - eval $(printf PASSIVE=%s /dev/[hs]da2) -fi - -mount "$PASSIVE" /mnt -DEV="$(detect_image_type_on_passive /mnt)" -umount /mnt - -for file in /data/mender/mender.conf $(find /boot/efi/ -name grub.cfg); do - sed -i -e "s,/dev/[hs]da,$DEV,g" "$file" -done -""" - - scripts = [ - os.path.join(dir, "ArtifactInstall_Leave_10_storage_device"), - os.path.join(dir, "ArtifactRollback_Leave_10_storage_device"), - ] - for script in scripts: - with open(script, "w") as fd: - fd.write(content) - return scripts - - def do_test_migrate_from_legacy_mender_v1_failure( - self, env, valid_image_with_mender_conf - ): - """ - Start a legacy client (1.7.0) first and update it to the new one. - - The test starts a setup with the 1.7.0 client and then updates it to - the current version. The update is failing first (due to failure - returned inside the artifact commit enter state script). - After the failed first update we are updating cient (1.7.0) again, - and this time the update should succeed. - """ - - mender_device = env.device - devauth = DeviceAuthV2(env.auth) - deploy = Deployments(env.auth, devauth) - - dirpath = tempfile.mkdtemp() - script_content = "#!/bin/sh\nexit 1\n" - with open(os.path.join(dirpath, "ArtifactCommit_Enter_01"), "w") as fd: - fd.write(script_content) - - active_part = mender_device.get_active_partition() - - ensure_persistent_conf = self.ensure_persistent_conf_script(dirpath) - storage_device_state_scripts = self.generate_storage_device_state_scripts( - dirpath - ) - - mender_conf = mender_device.run("cat /etc/mender/mender.conf") - mender_conf_json = json.loads(mender_conf) - # Delete these, we want the persistent_conf above to take effect. - del mender_conf_json["RootfsPartA"] - del mender_conf_json["RootfsPartB"] - valid_image = valid_image_with_mender_conf(json.dumps(mender_conf_json)) - - # first start with the failed update - host_ip = env.get_virtual_network_host_ip() - with mender_device.get_reboot_detector(host_ip) as reboot: - deployment_id, _ = common_update_procedure( - valid_image, - scripts=[ - ensure_persistent_conf, - os.path.join(dirpath, "ArtifactCommit_Enter_01"), - ] - + storage_device_state_scripts, - version=2, - devauth=devauth, - deploy=deploy, - ) - - logger.info("waiting for system to reboot twice") - reboot.verify_reboot_performed(number_of_reboots=2) - - assert mender_device.get_active_partition() == active_part - deploy.check_expected_statistics(deployment_id, "failure", 1) - - # do the next update, this time successful - update_image( - mender_device, - host_ip, - scripts=[ensure_persistent_conf] + storage_device_state_scripts, - install_image=valid_image, - version=2, - devauth=devauth, - deploy=deploy, - ) - - def do_test_migrate_from_legacy_mender_v1_success( - self, env, valid_image_with_mender_conf - ): - """ - Start a legacy client (1.7.0) first and update it to the new one. - - The test starts a setup with the 1.7.0 client and then updates it to - the current version. After the first successful update, we are updating - the client for the second time, to make sure the DB migration has not left - any traces in the database that are causing issues. - """ - - mender_device = env.device - devauth = DeviceAuthV2(env.auth) - deploy = Deployments(env.auth, devauth) - - tmpdir = tempfile.mkdtemp() - test_log = "/var/lib/mender/migration_state_scripts.log" - try: - ensure_persistent_conf = self.ensure_persistent_conf_script(tmpdir) - storage_device_state_scripts = self.generate_storage_device_state_scripts( - tmpdir - ) - - # Test that state scripts are also executed correctly. - scripts = ["ArtifactInstall_Enter_00", "ArtifactCommit_Enter_00"] - scripts_paths = [] - for script in scripts: - script_path = os.path.join(tmpdir, script) - scripts_paths += [script_path] - with open(script_path, "w") as fd: - fd.write("#!/bin/sh\necho $(basename $0) >> %s\n" % test_log) - - mender_conf = mender_device.run("cat /etc/mender/mender.conf") - mender_conf_json = json.loads(mender_conf) - # Delete these, we want the persistent_conf above to take effect. - del mender_conf_json["RootfsPartA"] - del mender_conf_json["RootfsPartB"] - valid_image = valid_image_with_mender_conf(json.dumps(mender_conf_json)) - - # do the successful update twice - host_ip = env.get_virtual_network_host_ip() - update_image( - mender_device, - host_ip, - install_image=valid_image, - scripts=[ensure_persistent_conf] - + storage_device_state_scripts - + scripts_paths, - version=2, - devauth=devauth, - deploy=deploy, - ) - assert mender_device.run("cat %s" % test_log).strip() == "\n".join(scripts) - - # NOTE: With client >= 4.x we only support Artifact version 3 - update_image( - mender_device, - host_ip, - install_image=valid_image, - # Second update should not need storage_device_state_scripts. - scripts=[ensure_persistent_conf] + scripts_paths, - version=3, - devauth=devauth, - deploy=deploy, - ) - assert mender_device.run("cat %s" % test_log).strip() == "\n".join( - scripts - ) + "\n" + "\n".join(scripts) - - finally: - shutil.rmtree(tmpdir) - - -class TestDBMigrationOpenSource(BaseTestDBMigration): - def test_migrate_from_legacy_mender_v1_failure( - self, setup_with_legacy_v1_client, valid_image_with_mender_conf - ): - self.do_test_migrate_from_legacy_mender_v1_failure( - setup_with_legacy_v1_client, valid_image_with_mender_conf - ) - - def test_migrate_from_legacy_mender_v1_success( - self, setup_with_legacy_v1_client, valid_image_with_mender_conf - ): - self.do_test_migrate_from_legacy_mender_v1_success( - setup_with_legacy_v1_client, valid_image_with_mender_conf - ) - - -class TestDBMigrationEnterprise(BaseTestDBMigration): - def test_migrate_from_legacy_mender_v1_failure( - self, enterprise_with_legacy_v1_client, valid_image_with_mender_conf - ): - self.do_test_migrate_from_legacy_mender_v1_failure( - enterprise_with_legacy_v1_client, valid_image_with_mender_conf - ) - - def test_migrate_from_legacy_mender_v1_success( - self, enterprise_with_legacy_v1_client, valid_image_with_mender_conf - ): - self.do_test_migrate_from_legacy_mender_v1_success( - enterprise_with_legacy_v1_client, valid_image_with_mender_conf - ) diff --git a/tests/tests/test_filetransfer.py b/tests/tests/test_filetransfer.py index 17221fc4b..b960a860f 100644 --- a/tests/tests/test_filetransfer.py +++ b/tests/tests/test_filetransfer.py @@ -33,6 +33,8 @@ from ..common_setup import ( enterprise_one_docker_client_bootstrapped, standard_setup_one_docker_client_bootstrapped, + class_persistent_enterprise_one_docker_client_bootstrapped, + class_persistent_standard_setup_one_docker_client_bootstrapped, ) from ..MenderAPI import ( @@ -465,8 +467,8 @@ def test_upload_limits_preserve_owner_and_group(self, mender_device_setup): "FileTransfer": { "Chroot": "/var/lib/mender/filetransfer", "FollowSymLinks": True, # in the image /var/lib/mender is a symlink + # PreserveOwner covers both owner and group via os.Chown(uid, gid) "PreserveOwner": True, - "PreserveGroup": True, }, }, self.auth, @@ -490,12 +492,13 @@ def test_upload_limits_preserve_owner_and_group(self, mender_device_setup): gid=str(gid), ) + assert r.status_code == 201, r.json() + owner_group = self.mender_device.run( f"ls -aln /var/lib/mender/filetransfer/{fname}.bin | cut -f 3,4 -d' '" ) assert owner_group == str(uid) + " " + str(gid) + "\n" - assert r.status_code == 201 def assert_forbidden(self, rsp, message): try: @@ -699,11 +702,11 @@ def rerun_on_timeouts(err, *args): class TestFileTransferDownloadOS(BaseTestFileTransferDownload): """Tests the file transfer functionality""" - @pytest.fixture(scope="function") + @pytest.fixture(scope="class") def mender_device_setup( - self, request, standard_setup_one_docker_client_bootstrapped + self, request, class_persistent_standard_setup_one_docker_client_bootstrapped ): - env = standard_setup_one_docker_client_bootstrapped + env = class_persistent_standard_setup_one_docker_client_bootstrapped request.cls.auth = env.auth request.cls.mender_device = env.device @@ -724,9 +727,11 @@ def test_download_ok(self, mender_device_setup, content_assertion=None): class TestFileTransferDownloadEnterprise(BaseTestFileTransferDownload): """Tests the file transfer functionality for enterprise setup""" - @pytest.fixture(scope="function") - def mender_device_setup(self, request, enterprise_one_docker_client_bootstrapped): - env = enterprise_one_docker_client_bootstrapped + @pytest.fixture(scope="class") + def mender_device_setup( + self, request, class_persistent_enterprise_one_docker_client_bootstrapped + ): + env = class_persistent_enterprise_one_docker_client_bootstrapped devid, auth_token, auth, mender_device = prepare_env_for_connect( env, docker=True, @@ -743,12 +748,12 @@ def mender_device_setup(self, request, enterprise_one_docker_client_bootstrapped class TestFileTransferLimitsOS(BaseTestFileTransferLimits): """Tests the file transfer functionality""" - @pytest.fixture(scope="function") + @pytest.fixture(scope="class") def mender_device_setup( - self, request, standard_setup_one_docker_client_bootstrapped + self, request, class_persistent_standard_setup_one_docker_client_bootstrapped ): - env = standard_setup_one_docker_client_bootstrapped + env = class_persistent_standard_setup_one_docker_client_bootstrapped request.cls.auth = env.auth request.cls.mender_device = env.device @@ -766,9 +771,11 @@ def mender_device_setup( class TestFileTransferLimitsEnterprise(BaseTestFileTransferLimits): """Tests the file transfer functionality for enterprise setup""" - @pytest.fixture(scope="function") - def mender_device_setup(self, request, enterprise_one_docker_client_bootstrapped): - env = enterprise_one_docker_client_bootstrapped + @pytest.fixture(scope="class") + def mender_device_setup( + self, request, class_persistent_enterprise_one_docker_client_bootstrapped + ): + env = class_persistent_enterprise_one_docker_client_bootstrapped devid, auth_token, auth, mender_device = prepare_env_for_connect( env, docker=True, diff --git a/tests/tests/test_mender_connect.py b/tests/tests/test_mender_connect.py index 329afb8dc..4948d2a95 100644 --- a/tests/tests/test_mender_connect.py +++ b/tests/tests/test_mender_connect.py @@ -18,6 +18,8 @@ import uuid from flaky import flaky +from redo import retriable +from websockets.exceptions import WebSocketException from testutils.api import proto_shell, protomsg from testutils.infra.cli import CliTenantadm @@ -165,11 +167,23 @@ def test_websocket_reconnect(self, docker_env): # Test that mender-connect recovers if it loses the connection to deviceconnect. docker_env.restart_service("mender-deviceconnect") - time.sleep(10) + # mender-connect needs time to re-establish its session after + # deviceconnect restarts; until it does, the mgmt /connect endpoint + # returns HTTP 404 ("device disconnected"). Poll instead of a fixed + # sleep that races the reconnect. (QA-1527) + @retriable( + attempts=24, + sleeptime=5, + sleepscale=1, + jitter=0, + retry_exceptions=(WebSocketException,), + ) + def assert_websocket_connects(): + with docker_env.devconnect.get_websocket(): + # Connecting successfully is enough. + pass - with docker_env.devconnect.get_websocket(): - # Nothing to do, just connecting successfully is enough. - pass + assert_websocket_connects() def test_bogus_shell_message(self, docker_env): self.assert_env(docker_env) @@ -212,14 +226,50 @@ def detect_shell_prompt(shell): "$ ", ], "Could not detect shell prompt." - with docker_env.devconnect.get_websocket() as ws: - shell = proto_shell.ProtoShell(ws) - body = shell.startShell() - assert shell.protomsg.props["status"] == protomsg.PROP_STATUS_NORMAL - assert body == proto_shell.MSG_BODY_SHELL_STARTED - - detect_shell_prompt(shell) - is_shell_working(shell) + # Poll the full open-and-start-shell sequence until it succeeds. After a + # network outage the device reconnects to deviceconnect only after + # connection backoff and the session can flap, so neither a fixed sleep + # nor polling get_websocket() (which only checks the management side) + # is reliable: + # - the mgmt /connect endpoint returns HTTP 404 ("device + # disconnected") until the device side is back, and + # - a just-reconnected session may not answer startShell yet + # (recv times out). + # Retrying the whole sequence is the only signal that proves a working + # shell. 48 * 5s = 240s covers the worst-case recovery after the 128s + # drop below; retriable re-raises the last error if it never recovers. + # AssertionErrors are not retried, so a genuine protocol failure still + # fails fast. (QA-1527) + @retriable( + attempts=48, + sleeptime=5, + sleepscale=1, + jitter=0, + retry_exceptions=(WebSocketException, TimeoutError), + ) + def assert_working_shell(): + with docker_env.devconnect.get_websocket() as ws: + shell = proto_shell.ProtoShell(ws) + body = shell.startShell() + if ( + shell.protomsg.props["status"] != protomsg.PROP_STATUS_NORMAL + and body + and b"already running" in body + ): + # A shell started by a previous attempt that flapped mid-use + # may not be reaped yet (the shell limit is 1 per device). + # Treat it as not-ready and let the retry wait for the device + # to release it instead of failing on the assert below. + raise TimeoutError( + "shell from a previous attempt is still running" + ) + assert shell.protomsg.props["status"] == protomsg.PROP_STATUS_NORMAL + assert body == proto_shell.MSG_BODY_SHELL_STARTED + + detect_shell_prompt(shell) + is_shell_working(shell) + + assert_working_shell() docker_env.device.run("apt-get update") docker_env.device.run("apt-get install -y iptables") @@ -236,17 +286,21 @@ def detect_shell_prompt(shell): # Re-enable a good connection docker_env.device.run("iptables -D OUTPUT 1") - time.sleep(128) - # mender-connect should have "healed" now and be able to start a new shell - with docker_env.devconnect.get_websocket() as ws: - shell = proto_shell.ProtoShell(ws) - body = shell.startShell() - assert shell.protomsg.props["status"] == protomsg.PROP_STATUS_NORMAL - assert body == proto_shell.MSG_BODY_SHELL_STARTED + # mender-connect's reconnect backoff escalates per-attempt and caps at + # 30 minutes (see connectionmanager/exponentialbackoff.go in + # mender-connect); after a 128s outage the backoff timer can leave the + # next reconnect attempt minutes away, which is not testable in a CI + # window. Restart mender-connect so the fresh process starts at + # attempts=0 and reconnects on its first cycle. The entrypoint's + # supervise loop respawns it; this mirrors the canonical pattern used + # by test_filetransfer.update_limits(). (QA-1527, QA-1591) + docker_env.device.run( + "kill -TERM `pidof mender-connect` 2>/dev/null || true" + ) - detect_shell_prompt(shell) - is_shell_working(shell) + # Poll until a working shell can be opened end-to-end. + assert_working_shell() @flaky(max_runs=3) def test_session_recording(self, docker_env): diff --git a/tests/tests/test_monitor_client.py b/tests/tests/test_monitor_client.py index 6c7905499..6a2ff6d4e 100644 --- a/tests/tests/test_monitor_client.py +++ b/tests/tests/test_monitor_client.py @@ -1593,30 +1593,41 @@ def test_monitorclient_remove_old_alerts(self, monitor_commercial_setup_no_clien ) mender_device.run("systemctl start mender-monitor") - # T8: mender-monitor started - time.sleep(alert_resend_interval_s) - time.sleep(alert_resend_interval_s) - - # Shift by 1s to avoid race condition when checking - time.sleep(1) + def count_alert_keys(): + output = mender_device.run( + "bash -c 'cd /usr/share/mender-monitor && . lib/fixlenstore-lib.sh;" + + "keys_nolock | wc -l;'" + ) + return int(output.strip()) + + def wait_for_alert_count(expected, max_wait_s): + # mender-monitor purges expired records on a periodic loop + # (DEFAULT_ALERT_STORE_RESEND_INTERVAL_S) whose restart latency + # varies, so poll for the expected count instead of reading once at a + # fixed offset, which races the purge cycle. The store drains + # monotonically (4 -> 2 -> 0), so 1s polling reliably observes each + # plateau. (QA-1527) + deadline = time.monotonic() + max_wait_s + count = None + while time.monotonic() < deadline: + count = count_alert_keys() + logger.info( + "test_monitorclient_remove_old_alerts: %d keys in store" % count + ) + if count == expected: + return + time.sleep(1) + assert count == expected, "expected %d alert keys in store, got %s" % ( + expected, + count, + ) - # T16+1: key1, key2 expired - output = mender_device.run( - "bash -c 'cd /usr/share/mender-monitor && . lib/fixlenstore-lib.sh;" - + "keys_nolock | wc -l;'" - ) - logger.info("test_monitorclient_remove_old_alerts got %s keys" % output) - assert output == "2\n" + # key1, key2 expire at alert_max_age, ~8s before key3, key4 (inserted 8s + # later), leaving a wide window where exactly 2 keys remain. + wait_for_alert_count(2, max_wait_s=2 * alert_max_age) - time.sleep(alert_resend_interval_s) - time.sleep(alert_resend_interval_s) - # T24+1: key3, key4 expired - output = mender_device.run( - "bash -c 'cd /usr/share/mender-monitor && . lib/fixlenstore-lib.sh;" - + "keys_nolock | wc -l;'" - ) - logger.info("test_monitorclient_remove_old_alerts got %s keys" % output) - assert output == "0\n" + # key3, key4 expire ~8s after key1, key2, draining the store completely. + wait_for_alert_count(0, max_wait_s=2 * alert_max_age) mender_device.run( "mv /usr/share/mender-monitor/config/config.sh.backup /usr/share/mender-monitor/config/config.sh" diff --git a/testutils/infra/device.py b/testutils/infra/device.py index 99fd4a32a..3d1bbe64f 100644 --- a/testutils/infra/device.py +++ b/testutils/infra/device.py @@ -427,19 +427,13 @@ def _run(conn, cmd, **kw): result = None start_time = time.time() sleeptime = 1 + max_sleeptime = 30 # cap to avoid multi-minute gaps between retries while time.time() < start_time + wait: - # Back off exponentially to save SSH handshakes in QEMU, which - # are quite expensive. - time.sleep(sleeptime) - sleeptime *= 2 - try: result = conn.run(cmd, **kw) break except ConnectionError as e: logger.info(f"Got SSH exception while connecting to host {conn.host}: {e}") - time.sleep(60) - continue except OSError as e: # The OSError is happening while there is no QEMU instance initialized logger.info( @@ -449,12 +443,17 @@ def _run(conn, cmd, **kw): ) if "Cannot assign requested address" not in str(e): raise e - continue except Exception as e: logger.exception( f"Generic exception happened while connecting to host {conn.host}" ) raise e + + # Back off exponentially to save SSH handshakes in QEMU, which + # are quite expensive. Capped so we never wait longer than + # max_sleeptime between attempts. + time.sleep(sleeptime) + sleeptime = min(sleeptime * 2, max_sleeptime) else: raise RuntimeError( f"Could not successfully run command after {wait} seconds on host {conn.host}: {cmd}"