From 44deb609ec4dd9b8a9f8c0bc04dcdcf953be34d4 Mon Sep 17 00:00:00 2001
From: Denys Shchedrivyi
Date: Wed, 6 May 2026 18:19:47 -0700
Subject: [PATCH 1/2] [VIRT] Refactor descheduler tests

Remove flaky and duplicate tests.

Changes:
- Remove the test_no_migrations_storm tests: they are flaky, and the
  cascading migrations they assert against are legitimate rebalancing
  behavior
- Remove TestDeschedulerEvictsVMAfterDrainUncordon: it duplicates the
  utilization imbalance test, only with a different setup

Co-Authored-By: Claude Sonnet 4.5
Signed-off-by: Denys Shchedrivyi
---
 tests/virt/node/descheduler/conftest.py       | 47 +-----------
 .../virt/node/descheduler/test_descheduler.py | 76 +------------------
 tests/virt/node/descheduler/utils.py          | 47 ------------
 3 files changed, 2 insertions(+), 168 deletions(-)

diff --git a/tests/virt/node/descheduler/conftest.py b/tests/virt/node/descheduler/conftest.py
index c78c2ab4aa..ace7b4839e 100644
--- a/tests/virt/node/descheduler/conftest.py
+++ b/tests/virt/node/descheduler/conftest.py
@@ -20,7 +20,6 @@
     deploy_vms,
     vm_nodes,
     vms_per_nodes,
-    wait_vmi_failover,
 )
 from tests.virt.utils import (
     build_node_affinity_dict,
@@ -29,11 +28,7 @@
 )
 from utilities.constants import TIMEOUT_5MIN, TIMEOUT_5SEC
 from utilities.infra import wait_for_pods_deletion
-from utilities.virt import (
-    node_mgmt_console,
-    wait_for_migration_finished,
-    wait_for_node_schedulable_status,
-)
+from utilities.virt import wait_for_migration_finished

 LOGGER = logging.getLogger(__name__)

@@ -123,46 +118,6 @@ def deployed_vms_for_descheduler_test(
     )


-@pytest.fixture(scope="class")
-def vms_orig_nodes_before_node_drain(deployed_vms_for_descheduler_test):
-    return vm_nodes(vms=deployed_vms_for_descheduler_test)
-
-
-@pytest.fixture(scope="class")
-def vms_boot_time_before_node_drain(
-    deployed_vms_for_descheduler_test,
-):
-    yield get_boot_time_for_multiple_vms(vm_list=deployed_vms_for_descheduler_test)
-
-
-@pytest.fixture(scope="class")
-def node_to_drain(
-    schedulable_nodes,
-    vms_orig_nodes_before_node_drain,
-):
-    vm_per_node_counters = vms_per_nodes(vms=vms_orig_nodes_before_node_drain)
-    for node in schedulable_nodes:
-        if vm_per_node_counters[node.name] > 0:
-            return node
-
-    raise ValueError("No suitable node to drain")
-
-
-@pytest.fixture()
-def drain_uncordon_node(
-    admin_client,
-    deployed_vms_for_descheduler_test,
-    vms_orig_nodes_before_node_drain,
-    node_to_drain,
-):
-    """Return when node is schedulable again after uncordon"""
-    with node_mgmt_console(admin_client=admin_client, node=node_to_drain, node_mgmt="drain"):
-        wait_for_node_schedulable_status(node=node_to_drain, status=False)
-        for vm in deployed_vms_for_descheduler_test:
-            if vms_orig_nodes_before_node_drain[vm.name].name == node_to_drain.name:
-                wait_vmi_failover(vm=vm, orig_node=vms_orig_nodes_before_node_drain[vm.name])
-
-
 @pytest.fixture()
 def all_existing_migrations_completed(admin_client, namespace):
     # Descheduler may trigger multiple migrations, need to wait when all succeeded
diff --git a/tests/virt/node/descheduler/test_descheduler.py b/tests/virt/node/descheduler/test_descheduler.py
index ccd41ffbaf..de52e411d9 100644
--- a/tests/virt/node/descheduler/test_descheduler.py
+++ b/tests/virt/node/descheduler/test_descheduler.py
@@ -1,18 +1,13 @@
-import logging
-
 import pytest
 from ocp_resources.resource import ResourceEditor

 from tests.virt.node.descheduler.constants import DESCHEDULER_TEST_LABEL
 from tests.virt.node.descheduler.utils import (
     assert_vms_consistent_virt_launcher_pods,
-    assert_vms_distribution_after_failover,
     verify_at_least_one_vm_migrated,
 )
 from tests.virt.utils import verify_guest_boot_time

-LOGGER = logging.getLogger(__name__)
-
 pytestmark = [
     pytest.mark.tier3,
     pytest.mark.descheduler,
@@ -23,59 +18,6 @@
     ),
 ]

-NO_MIGRATION_STORM_ASSERT_MESSAGE = "Verify no migration storm after triggered migrations by the descheduler."
-
-
-@pytest.mark.parametrize(
-    "calculated_vm_deployment_for_descheduler_test",
-    [pytest.param(0.50)],
-    indirect=True,
-)
-class TestDeschedulerEvictsVMAfterDrainUncordon:
-    TESTS_CLASS_NAME = "TestDeschedulerEvictsVMAfterDrainUncordon"
-
-    @pytest.mark.dependency(name=f"{TESTS_CLASS_NAME}::test_descheduler_evicts_vm_after_drain_uncordon")
-    @pytest.mark.polarion("CNV-5922")
-    def test_descheduler_evicts_vm_after_drain_uncordon(
-        self,
-        schedulable_nodes,
-        deployed_vms_for_descheduler_test,
-        vms_boot_time_before_node_drain,
-        drain_uncordon_node,
-    ):
-        assert_vms_distribution_after_failover(
-            vms=deployed_vms_for_descheduler_test,
-            nodes=schedulable_nodes,
-        )
-
-    @pytest.mark.dependency(
-        name=f"{TESTS_CLASS_NAME}::test_no_migrations_storm",
-        depends=[f"{TESTS_CLASS_NAME}::test_descheduler_evicts_vm_after_drain_uncordon"],
-    )
-    @pytest.mark.polarion("CNV-7316")
-    def test_no_migrations_storm(
-        self,
-        deployed_vms_for_descheduler_test,
-        all_existing_migrations_completed,
-        admin_client,
-    ):
-        LOGGER.info(NO_MIGRATION_STORM_ASSERT_MESSAGE)
-        assert_vms_consistent_virt_launcher_pods(
-            running_vms=deployed_vms_for_descheduler_test, admin_client=admin_client
-        )
-
-    @pytest.mark.dependency(depends=[f"{TESTS_CLASS_NAME}::test_no_migrations_storm"])
-    @pytest.mark.polarion("CNV-8288")
-    def test_boot_time_after_migrations_complete(
-        self,
-        deployed_vms_for_descheduler_test,
-        vms_boot_time_before_node_drain,
-    ):
-        verify_guest_boot_time(
-            vm_list=deployed_vms_for_descheduler_test,
-            initial_boot_time=vms_boot_time_before_node_drain,
-        )
-

 @pytest.mark.parametrize(
     "calculated_vm_deployment_for_node_with_least_available_memory, deployed_vms_for_utilization_imbalance",
@@ -108,23 +50,7 @@ def test_descheduler_evicts_vm_from_utilization_imbalance(
             vms=deployed_vms_for_utilization_imbalance, node_before=node_with_least_available_memory
         )

-    @pytest.mark.dependency(
-        name=f"{TESTS_CLASS_NAME}::test_no_migrations_storm",
-        depends=[f"{TESTS_CLASS_NAME}::test_descheduler_evicts_vm_from_utilization_imbalance"],
-    )
-    @pytest.mark.polarion("CNV-8918")
-    def test_no_migrations_storm(
-        self,
-        deployed_vms_for_utilization_imbalance,
-        all_existing_migrations_completed,
-        admin_client,
-    ):
-        LOGGER.info(NO_MIGRATION_STORM_ASSERT_MESSAGE)
-        assert_vms_consistent_virt_launcher_pods(
-            running_vms=deployed_vms_for_utilization_imbalance, admin_client=admin_client
-        )
-
-    @pytest.mark.dependency(depends=[f"{TESTS_CLASS_NAME}::test_no_migrations_storm"])
+    @pytest.mark.dependency(depends=[f"{TESTS_CLASS_NAME}::test_descheduler_evicts_vm_from_utilization_imbalance"])
     @pytest.mark.polarion("CNV-8919")
     def test_boot_time_after_migrations_complete(
         self,
diff --git a/tests/virt/node/descheduler/utils.py b/tests/virt/node/descheduler/utils.py
index c9550fb8d2..3712b746c1 100644
--- a/tests/virt/node/descheduler/utils.py
+++ b/tests/virt/node/descheduler/utils.py
@@ -19,7 +19,6 @@
     TIMEOUT_5MIN,
     TIMEOUT_5SEC,
     TIMEOUT_10MIN,
-    TIMEOUT_15MIN,
     TIMEOUT_20SEC,
     NamespacesNames,
 )
@@ -95,52 +94,6 @@ def calculate_vm_deployment(
     return vm_deployment


-def wait_vmi_failover(vm, orig_node):
-    samples = TimeoutSampler(wait_timeout=TIMEOUT_15MIN, sleep=TIMEOUT_5SEC, func=lambda: vm.vmi.node.name)
-    LOGGER.info(f"Waiting for {vm.name} to be moved from node {orig_node.name}")
-    try:
-        for sample in samples:
-            if sample and sample != orig_node.name:
-                return
-    except TimeoutExpiredError:
-        LOGGER.error(f"VM {vm.name} failed to deploy on new node")
-        raise
-
-
-def assert_vms_distribution_after_failover(vms, nodes, all_nodes=True):
-    def _get_vms_per_nodes():
-        return vms_per_nodes(vms=vm_nodes(vms=vms))
-
-    # Allow the descheduler to cycle multiple times before returning.
-    # The value can be affected by high pod counts or load within
-    # the cluster which increases the descheduler runtime.
-    descheduling_failover_timeout = DESCHEDULING_INTERVAL_120SEC * 3
-
-    if all_nodes:
-        LOGGER.info("Verify all nodes have at least one VM running")
-    else:
-        LOGGER.info("Verify at least one node has a VM running")
-
-    samples = TimeoutSampler(
-        wait_timeout=descheduling_failover_timeout,
-        sleep=TIMEOUT_5SEC,
-        func=_get_vms_per_nodes,
-    )
-    vms_per_nodes_dict = None
-    try:
-        for vms_per_nodes_dict in samples:
-            vm_counts = [vm_count for vm_count in vms_per_nodes_dict.values() if vm_count]
-            if all_nodes and len(vm_counts) == len(nodes):
-                LOGGER.info(f"Every node has at least one VM running on it: {vms_per_nodes_dict}")
-                return
-            elif vm_counts and not all_nodes:
-                LOGGER.info(f"There is at least one node with a VM running on it: {vms_per_nodes_dict}")
-                return
-    except TimeoutExpiredError:
-        LOGGER.error(f"Running VMs missing from nodes: {vms_per_nodes_dict}")
-        raise
-
-
 def vms_per_nodes(vms):
     """
     Args:

From ce064cc9840b8da59ffb9c03973d8cafb6013a93 Mon Sep 17 00:00:00 2001
From: Denys Shchedrivyi
Date: Thu, 7 May 2026 13:56:21 -0700
Subject: [PATCH 2/2] [VIRT] Fix flaky PSI metrics tests by selecting node with most VMs

The node_to_run_stress fixture was picking the first node that had any
VMs on it, which could result in insufficient load. Select the node
with the most VMs instead, to maximize the stress load and ensure the
overutilization threshold is crossed.

Signed-off-by: Denys Shchedrivyi
---
 tests/virt/node/descheduler/conftest.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/virt/node/descheduler/conftest.py b/tests/virt/node/descheduler/conftest.py
index ace7b4839e..ade5c543f2 100644
--- a/tests/virt/node/descheduler/conftest.py
+++ b/tests/virt/node/descheduler/conftest.py
@@ -275,10 +275,12 @@ def utilization_imbalance(

 @pytest.fixture(scope="class")
 def node_to_run_stress(schedulable_nodes, deployed_vms_for_descheduler_test):
     vm_per_node_counters = vms_per_nodes(vms=vm_nodes(vms=deployed_vms_for_descheduler_test))
-    for node in schedulable_nodes:
-        if vm_per_node_counters[node.name] > 0:
-            LOGGER.info(f"Node to run stress: {node.name}")
-            return node
+    node_with_most_vms = max(schedulable_nodes, key=lambda node: vm_per_node_counters.get(node.name, 0))
+    if vm_per_node_counters[node_with_most_vms.name] > 0:
+        LOGGER.info(
+            f"Node to run stress: {node_with_most_vms.name} with {vm_per_node_counters[node_with_most_vms.name]} VMs"
+        )
+        return node_with_most_vms

     raise ValueError("No suitable node to run stress")