From 925bc9e42b7feda44799f5e15dc27ffa620cef10 Mon Sep 17 00:00:00 2001
From: Guramrit Singh
Date: Wed, 29 Apr 2026 11:10:30 -0700
Subject: [PATCH 01/11] Adding a test to inspect while create_io_queue is in
 flight

---
 .../tests/multiarch/openhcl_servicing.rs | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs
index 631e120505..dfa7d35806 100644
--- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs
+++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs
@@ -1176,6 +1176,114 @@ async fn apply_fault_with_keepalive(
     Ok(vm)
 }
 
+/// Verifies that save works correctly when a create_io_queue command
+/// is stuck. The `DriverWorkerTask` run loop should still be able to process
+/// save commands when the stuck create_io_queue command completes, even when
+/// that happens after save has been issued.
+#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])]
+async fn servicing_keepalive_slow_create_io_queue_with_inspect(
+    config: PetriVmBuilder,
+    (igvm_file,): (ResolvedArtifact,),
+) -> Result<(), anyhow::Error> {
+    const QUEUE_CREATION_DELAY: Duration = Duration::from_secs(20);
+    const TRIGGER_CREATE_IO_QUEUE_TIMEOUT: Duration = Duration::from_secs(5);
+    const TOTAL_SAVE_TIMEOUT: Duration = Duration::from_secs(50);
+
+    let mut flags = config.default_servicing_flags();
+    flags.enable_nvme_keepalive = true;
+    let mut fault_start_updater = CellUpdater::new(false);
+
+    let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
+        .with_admin_queue_fault(
+            AdminQueueFaultConfig::new().with_submission_queue_fault(
+                CommandMatchBuilder::new()
+                    .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0)
+                    .build(),
+                AdminQueueFaultBehavior::Delay(QUEUE_CREATION_DELAY),
+            ),
+        );
+
+    let scsi_controller_guid = Guid::new_random();
+    let disk_size = 4 * 1024 * 1024; // 4 MiB — enough for dd reads
+    let vp_count = 6;
+
+    let (mut vm, agent) = create_keepalive_test_config_custom_vps(
+        config,
+        fault_configuration,
+        VTL0_NVME_LUN,
+        scsi_controller_guid,
+        disk_size,
+        vp_count,
+    )
+    .await?;
+    agent.ping().await?;
+
+    let cpus_with_issuers = find_cpus_with_io_issuers(&vm).await?;
+    let target_cpu = (0u32..vp_count)
+        .find(|cpu| !cpus_with_issuers.contains(cpu))
+        .unwrap_or_else(|| {
+            panic!(
+                "all {vp_count} CPUs already have IO issuers after boot — \
+                test cannot exercise create_io_queue. Consider increasing vp_count."
+            )
+        });
+    tracing::info!(
+        target_cpu,
+        existing_issuers = ?cpus_with_issuers,
+        "selected target CPU with no IO issuer"
+    );
+
+    // Resolve the disk path before save. The device might appear as /dev/sda
+    // or /dev/sdb depending on timing.
+    let device_paths = get_device_paths(
+        &agent,
+        scsi_controller_guid,
+        vec![ExpectedGuestDevice {
+            lun: VTL0_NVME_LUN,
+            disk_size_sectors: (disk_size / SCSI_SECTOR_SIZE) as usize,
+            friendly_name: "nvme_disk".to_string(),
+        }],
+    )
+    .await?;
+    assert!(device_paths.len() == 1);
+    let disk_path = &device_paths[0];
+
+    // DEV NOTE: `run_cpu_pinned_io` only needs to be run for a duration that
+    // guarantees that the create_io_queue command gets stuck. Ideally this
+    // should be event-driven instead of time-driven, but the infrastructure
+    // for that is not in place yet.
+    // Even though the dd command will time out, the run loop will be stuck
+    // until the create_io_queue command completes.
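+    // Concretely: QUEUE_CREATION_DELAY is much larger than
+    // TRIGGER_CREATE_IO_QUEUE_TIMEOUT, so dd is cancelled while the delayed
+    // CREATE_IO_COMPLETION_QUEUE command is still outstanding, and
+    // TOTAL_SAVE_TIMEOUT bounds how long the save issued afterwards may take.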
+ fault_start_updater.set(true).await; + let io_result = CancelContext::new() + .with_timeout(TRIGGER_CREATE_IO_QUEUE_TIMEOUT) + .until_cancelled(run_cpu_pinned_io(&agent, disk_path, target_cpu)) + .await;d + + assert!( + io_result.is_err(), + "IO command should have timed out. This likely means the create_io_queue command did not get injected correctly." + ); + + let nvme_device_inspect = vm.inspect_openhcl("vm/nvme/devices", None, None).await?; + tracing::info!(nvme_device_inspect = ?nvme_device_inspect, "nvme device inspected"); + + CancelContext::new() + .with_timeout(TOTAL_SAVE_TIMEOUT) + .until_cancelled(vm.save_openhcl(igvm_file.clone(), flags)) + .await + .expect("VM save did not complete within the given timeout, even though it should have. Save is stuck when draining after restore with slow create_io_queue.") + .expect("Save failed"); + + let vm_inspect = vm.inspect_openhcl("", None, None).await?; + tracing::info!(vm_inspect = ?vm_inspect, "vm inspected"); + + fault_start_updater.set(false).await; + vm.restore_openhcl().await?; + agent.ping().await?; + Ok(()) +} + async fn create_keepalive_test_config_default( config: PetriVmBuilder, fault_configuration: FaultConfiguration, From 7611e74d473d2546da8200b946f127f5dac94ea3 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 29 Apr 2026 11:41:50 -0700 Subject: [PATCH 02/11] Saving the test --- vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index dfa7d35806..27f55a1af1 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1258,7 +1258,7 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( let io_result = CancelContext::new() .with_timeout(TRIGGER_CREATE_IO_QUEUE_TIMEOUT) .until_cancelled(run_cpu_pinned_io(&agent, disk_path, target_cpu)) - .await;d + .await; assert!( io_result.is_err(), From def3ff54d4b0139b181f12947bad335b38230dd0 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 29 Apr 2026 14:21:25 -0700 Subject: [PATCH 03/11] Testing working, but still sometimes triple faulting --- Cargo.lock | 1 + vmm_tests/vmm_tests/Cargo.toml | 1 + .../tests/multiarch/openhcl_servicing.rs | 148 ++++++++++++------ 3 files changed, 104 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d23379eb0e..70a9c50dd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10038,6 +10038,7 @@ dependencies = [ "get_resources", "guid", "hyperv_ic_resources", + "inspect", "jiff", "kmsg", "memory_range", diff --git a/vmm_tests/vmm_tests/Cargo.toml b/vmm_tests/vmm_tests/Cargo.toml index c2e7740a73..f19360d8bc 100644 --- a/vmm_tests/vmm_tests/Cargo.toml +++ b/vmm_tests/vmm_tests/Cargo.toml @@ -23,6 +23,7 @@ tmk_tests.workspace = true petri_artifacts_common.workspace = true petri.workspace = true +inspect.workspace = true get_resources.workspace = true openvmm_defs.workspace = true diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 27f55a1af1..06b7375e89 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1137,45 +1137,6 @@ async fn servicing_keepalive_slow_create_io_queue( Ok(()) } -async fn apply_fault_with_keepalive( - config: 
PetriVmBuilder, - fault_configuration: FaultConfiguration, - mut fault_start_updater: CellUpdater, - igvm_file: ResolvedArtifact, - new_cmdline: Option<&str>, -) -> Result, anyhow::Error> { - let mut flags = config.default_servicing_flags(); - flags.enable_nvme_keepalive = true; - let (mut vm, agent) = create_keepalive_test_config_default( - config, - fault_configuration, - VTL0_NVME_LUN, - Guid::new_random(), - DEFAULT_DISK_SIZE, - ) - .await?; - - agent.ping().await?; - let sh = agent.unix_shell(); - - // Make sure the disk showed up. - cmd!(sh, "ls /dev/sda").run().await?; - - fault_start_updater.set(true).await; - - if let Some(cmdline) = new_cmdline { - vm.update_command_line(cmdline).await?; - } - - vm.restart_openhcl(igvm_file.clone(), flags).await?; - - // Ensure the agent is responsive after the restart before returning. - agent.ping().await?; - - fault_start_updater.set(false).await; - Ok(vm) -} - /// Verifies that save works correctly when a create_io_queue command /// is stuck. The `DriverWorkerTask` run loop should still be able to process /// save commands when the stuck create_io_queue command completes, even when @@ -1185,7 +1146,7 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( config: PetriVmBuilder, (igvm_file,): (ResolvedArtifact,), ) -> Result<(), anyhow::Error> { - const QUEUE_CREATION_DELAY: Duration = Duration::from_secs(20); + const QUEUE_CREATION_DELAY: Duration = Duration::from_secs(200); const TRIGGER_CREATE_IO_QUEUE_TIMEOUT: Duration = Duration::from_secs(5); const TOTAL_SAVE_TIMEOUT: Duration = Duration::from_secs(50); @@ -1205,7 +1166,7 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( let scsi_controller_guid = Guid::new_random(); let disk_size = 4 * 1024 * 1024; // 4 MiB — enough for dd reads - let vp_count = 6; + let vp_count = 4; let (mut vm, agent) = create_keepalive_test_config_custom_vps( config, @@ -1265,8 +1226,10 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( "IO command should have timed out. This likely means the create_io_queue command did not get injected correctly." ); - let nvme_device_inspect = vm.inspect_openhcl("vm/nvme/devices", None, None).await?; - tracing::info!(nvme_device_inspect = ?nvme_device_inspect, "nvme device inspected"); + for _ in 0..3 { + let nvme_device_inspect = vm.inspect_openhcl("vm/nvme/devices", None, None).await?; + tracing::info!(nvme_device_inspect = ?nvme_device_inspect, "nvme device inspected"); + } CancelContext::new() .with_timeout(TOTAL_SAVE_TIMEOUT) @@ -1275,15 +1238,108 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( .expect("VM save did not complete within the given timeout, even though it should have. Save is stuck when draining after restore with slow create_io_queue.") .expect("Save failed"); - let vm_inspect = vm.inspect_openhcl("", None, None).await?; - tracing::info!(vm_inspect = ?vm_inspect, "vm inspected"); - fault_start_updater.set(false).await; + vm.restore_openhcl().await?; agent.ping().await?; + + let vm_inspect = vm + .inspect_openhcl( + "vm/nvme/devices/182f:00:00.0/driver/driver/admin/commands/commands", + None, + None, + ) + .await?; + + tracing::info!("vm inspected {}", vm_inspect.json()); + let entries = match vm_inspect { + inspect::Node::Dir(entries) => entries, + _ => { + panic!( + "expected node for the list of commands but found {}", + vm_inspect.json() + ); + } + }; + + assert!(entries.len() == 1, "expected only 1 entry, the AER command"); + + // Helper: descend through a chain of named directory children. 
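+    // For example, dir_child(dir_child(&node, "command"), "cdw0") descends
+    // command/cdw0, panicking (and failing the test) if a link is missing.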
+ fn dir_child<'a>(node: &'a inspect::Node, name: &str) -> &'a inspect::Node { + match node { + inspect::Node::Dir(children) => { + &children + .iter() + .find(|c| c.name == name) + .unwrap_or_else(|| panic!("missing child '{name}' in inspect dir")) + .node + } + other => panic!("expected dir for '{name}', found {:?}", other), + } + } + + let create_io_cq_opcode = u64::from(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0); + + for e in &entries { + let opcode_node = dir_child(dir_child(dir_child(&e.node, "command"), "cdw0"), "opcode"); + let opcode = match opcode_node { + inspect::Node::Value(v) => match v.kind { + inspect::ValueKind::Unsigned(n) => n, + ref other => panic!("opcode for entry '{}' is not unsigned: {:?}", e.name, other), + }, + other => panic!("opcode for entry '{}' is not a value: {:?}", e.name, other), + }; + tracing::info!(entry = %e.name, opcode, "inspected pending command"); + assert_ne!( + opcode, create_io_cq_opcode, + "pending command '{}' has CREATE_IO_COMPLETION_QUEUE opcode (5); \ + expected the slow create_io_queue command to have completed before save", + e.name + ); + } + Ok(()) } +async fn apply_fault_with_keepalive( + config: PetriVmBuilder, + fault_configuration: FaultConfiguration, + mut fault_start_updater: CellUpdater, + igvm_file: ResolvedArtifact, + new_cmdline: Option<&str>, +) -> Result, anyhow::Error> { + let mut flags = config.default_servicing_flags(); + flags.enable_nvme_keepalive = true; + let (mut vm, agent) = create_keepalive_test_config_default( + config, + fault_configuration, + VTL0_NVME_LUN, + Guid::new_random(), + DEFAULT_DISK_SIZE, + ) + .await?; + + agent.ping().await?; + let sh = agent.unix_shell(); + + // Make sure the disk showed up. + cmd!(sh, "ls /dev/sda").run().await?; + + fault_start_updater.set(true).await; + + if let Some(cmdline) = new_cmdline { + vm.update_command_line(cmdline).await?; + } + + vm.restart_openhcl(igvm_file.clone(), flags).await?; + + // Ensure the agent is responsive after the restart before returning. + agent.ping().await?; + + fault_start_updater.set(false).await; + Ok(vm) +} + async fn create_keepalive_test_config_default( config: PetriVmBuilder, fault_configuration: FaultConfiguration, From ab4e83778fd16af740c622b29cfb1908fe4ea5fb Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 29 Apr 2026 14:21:55 -0700 Subject: [PATCH 04/11] Don't check the tree, only check command count --- .../tests/multiarch/openhcl_servicing.rs | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 06b7375e89..072c28338d 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1261,43 +1261,8 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( ); } }; - assert!(entries.len() == 1, "expected only 1 entry, the AER command"); - // Helper: descend through a chain of named directory children. 
- fn dir_child<'a>(node: &'a inspect::Node, name: &str) -> &'a inspect::Node { - match node { - inspect::Node::Dir(children) => { - &children - .iter() - .find(|c| c.name == name) - .unwrap_or_else(|| panic!("missing child '{name}' in inspect dir")) - .node - } - other => panic!("expected dir for '{name}', found {:?}", other), - } - } - - let create_io_cq_opcode = u64::from(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0); - - for e in &entries { - let opcode_node = dir_child(dir_child(dir_child(&e.node, "command"), "cdw0"), "opcode"); - let opcode = match opcode_node { - inspect::Node::Value(v) => match v.kind { - inspect::ValueKind::Unsigned(n) => n, - ref other => panic!("opcode for entry '{}' is not unsigned: {:?}", e.name, other), - }, - other => panic!("opcode for entry '{}' is not a value: {:?}", e.name, other), - }; - tracing::info!(entry = %e.name, opcode, "inspected pending command"); - assert_ne!( - opcode, create_io_cq_opcode, - "pending command '{}' has CREATE_IO_COMPLETION_QUEUE opcode (5); \ - expected the slow create_io_queue command to have completed before save", - e.name - ); - } - Ok(()) } From 8902451d57ccd2106172af81f76de3a432d8d45e Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 29 Apr 2026 14:51:51 -0700 Subject: [PATCH 05/11] Test passing now with the fix --- .../disk_nvme/nvme_driver/src/driver.rs | 47 +++++++++---------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 25bb4472df..2e08fa03b9 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -1308,32 +1308,29 @@ impl AsyncRun for DriverWorkerTask { stop: &mut task_control::StopTask<'_>, state: &mut WorkerState, ) -> Result<(), task_control::Cancelled> { - let r = stop - .until_stopped(async { - loop { - match self.recv.next().await { - Some(NvmeWorkerRequest::CreateIssuer(rpc)) => { - rpc.handle(async |cpu| self.create_io_issuer(state, cpu).await) - .await - } - Some(NvmeWorkerRequest::Save(rpc)) => { - rpc.handle(async |span| { - let child_span = tracing::info_span!( - parent: &span, - "nvme_worker_save", - pci_id = %self.device.id() - ); - self.save(state).instrument(child_span).await - }) - .await - } - None => break, - } + loop { + let cmd = stop.until_stopped(self.recv.next()).await?; + match cmd { + Some(NvmeWorkerRequest::CreateIssuer(rpc)) => { + rpc.handle(async |cpu| self.create_io_issuer(state, cpu).await) + .await } - }) - .await; - tracing::info!(pci_id = %self.device.id(), "nvme worker task exiting"); - r + Some(NvmeWorkerRequest::Save(rpc)) => { + rpc.handle(async |span| { + let child_span = tracing::info_span!( + parent: &span, + "nvme_worker_save", + pci_id = %self.device.id() + ); + self.save(state).instrument(child_span).await + }) + .await + } + None => break, + } + } + tracing::info!(pci_id = %self.device.id(), "nvme worker task exiting cleanly"); + Ok(()) } } From a0c42c9561d0bb89d869bad76713240f9bcce54d Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 29 Apr 2026 15:05:53 -0700 Subject: [PATCH 06/11] Adding in some logging --- .../tests/tests/multiarch/openhcl_servicing.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 072c28338d..193680ec79 100644 --- 
a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1138,17 +1138,17 @@ async fn servicing_keepalive_slow_create_io_queue( } /// Verifies that save works correctly when a create_io_queue command -/// is stuck. The `DriverWorkerTask` run loop should still be able to process -/// save commands when the stuck create_io_queue command completes, even when -/// that happens after save has been issued. +/// is still in flight and inspect is called on the device. Previously we saw +/// inspect calls inadvertently throwing away create_io_issuer futures and then +/// save being serviced with CREATE_IO_COMPLETION_QUEUE commands still pending. #[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] async fn servicing_keepalive_slow_create_io_queue_with_inspect( config: PetriVmBuilder, (igvm_file,): (ResolvedArtifact,), ) -> Result<(), anyhow::Error> { - const QUEUE_CREATION_DELAY: Duration = Duration::from_secs(200); + const QUEUE_CREATION_DELAY: Duration = Duration::from_secs(60); const TRIGGER_CREATE_IO_QUEUE_TIMEOUT: Duration = Duration::from_secs(5); - const TOTAL_SAVE_TIMEOUT: Duration = Duration::from_secs(50); + const TOTAL_SAVE_TIMEOUT: Duration = Duration::from_secs(15); let mut flags = config.default_servicing_flags(); flags.enable_nvme_keepalive = true; @@ -1226,7 +1226,10 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( "IO command should have timed out. This likely means the create_io_queue command did not get injected correctly." ); - for _ in 0..3 { + // In previous versions invoking inspect would cause the DriverWorkerTask to + // just drop the stuck create io queue command and service the save with + // pending admin commands (not good) + for _ in 0..2 { let nvme_device_inspect = vm.inspect_openhcl("vm/nvme/devices", None, None).await?; tracing::info!(nvme_device_inspect = ?nvme_device_inspect, "nvme device inspected"); } From 03703f4ae55ec47d8103b0e48d3e70d760e17605 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 29 Apr 2026 15:18:18 -0700 Subject: [PATCH 07/11] Some more minor updates --- .../tests/multiarch/openhcl_servicing.rs | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 193680ec79..e991a1d997 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1229,10 +1229,23 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( // In previous versions invoking inspect would cause the DriverWorkerTask to // just drop the stuck create io queue command and service the save with // pending admin commands (not good) - for _ in 0..2 { - let nvme_device_inspect = vm.inspect_openhcl("vm/nvme/devices", None, None).await?; - tracing::info!(nvme_device_inspect = ?nvme_device_inspect, "nvme device inspected"); - } + let nvme_device_inspect = vm.inspect_openhcl("vm/nvme/devices", None, None).await?; + tracing::info!(nvme_device_inspect = ?nvme_device_inspect, "nvme device inspected"); + + let entries = match &nvme_device_inspect { + inspect::Node::Dir(entries) => entries, + _ => panic!( + "expected dir for 'vm/nvme/devices' but found {}", + nvme_device_inspect.json() + ), + }; + assert_eq!( + entries.len(), + 1, + "expected exactly 1 NVMe device under 'vm/nvme/devices', found {}", + entries.len() + 
); + let nvme_device_name = entries[0].name.clone(); CancelContext::new() .with_timeout(TOTAL_SAVE_TIMEOUT) @@ -1248,7 +1261,7 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( let vm_inspect = vm .inspect_openhcl( - "vm/nvme/devices/182f:00:00.0/driver/driver/admin/commands/commands", + &format!("vm/nvme/devices/{nvme_device_name}/driver/driver/admin/commands/commands"), None, None, ) @@ -1259,7 +1272,7 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( inspect::Node::Dir(entries) => entries, _ => { panic!( - "expected node for the list of commands but found {}", + "expected list of pending commands but found {}", vm_inspect.json() ); } From 852ba4ba2ba6057b82c16cb8b4c4ae2b30b04289 Mon Sep 17 00:00:00 2001 From: Guramrit Singh <127339643+gurasinghMS@users.noreply.github.com> Date: Wed, 29 Apr 2026 15:38:58 -0700 Subject: [PATCH 08/11] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index e991a1d997..eb98ccb984 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1277,7 +1277,7 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( ); } }; - assert!(entries.len() == 1, "expected only 1 entry, the AER command"); + assert_eq!(entries.len(), 1, "expected only 1 entry, the AER command"); Ok(()) } From 1bd18f000e5755b867b031ed18f639750d28529f Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 6 May 2026 13:39:21 -0700 Subject: [PATCH 09/11] Mesh channels should be borrowed instead of taken making the queue handler loop cancel proof --- .../disk_nvme/nvme_driver/src/queue_pair.rs | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index 73fc432a53..bed5d294f1 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -347,8 +347,8 @@ impl DrainAfterRestore { struct QueueHandlerLoop { queue_handler: QueueHandler, registers: Arc>, - recv_req: Option>, - recv_cmd: Option>, + recv_req: mesh::Receiver, + recv_cmd: mesh::Receiver, interrupt: DeviceInterrupt, } @@ -362,8 +362,8 @@ impl AsyncRun<()> for QueueHandlerLoop { self.queue_handler .run( &self.registers, - self.recv_req.take().unwrap(), - self.recv_cmd.take().unwrap(), + &mut self.recv_req, + &mut self.recv_cmd, &mut self.interrupt, ) .await; @@ -523,8 +523,8 @@ impl QueuePair { let mut task = TaskControl::new(QueueHandlerLoop { queue_handler, registers, - recv_req: Some(recv_req), - recv_cmd: Some(recv_cmd), + recv_req, + recv_cmd, interrupt, }); task.insert(spawner, "nvme-queue", ()); @@ -1159,11 +1159,13 @@ struct QueueStats { } impl QueueHandler { + /// Run the queue handler loop. This needs to be cancel safe because the + /// task can be stopped at any time and restarted. 
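+    /// Borrowing the receivers here (instead of `take()`ing them out of an
+    /// `Option` as before) is what makes restarting safe: re-entering `run`
+    /// would otherwise panic on the already-emptied `Option`s.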
async fn run( &mut self, registers: &DeviceRegisters, - mut recv_req: mesh::Receiver, - mut recv_cmd: mesh::Receiver, + recv_req: &mut mesh::Receiver, + recv_cmd: &mut mesh::Receiver, interrupt: &mut DeviceInterrupt, ) { if matches!( @@ -1247,7 +1249,7 @@ impl QueueHandler { Event::Request(req) => match req { Req::Save(queue_state) => { tracing::info!(pci_id = ?self.device_id, qid = ?self.qid, "received save request, shutting down ..."); - queue_state.complete(self.save().await); + queue_state.complete(self.save()); // Do not allow any more processing after save completed. break; } @@ -1310,7 +1312,7 @@ impl QueueHandler { } /// Save queue data for servicing. - pub async fn save(&self) -> anyhow::Result { + pub fn save(&self) -> anyhow::Result { // Log pending admin command wait durations at save time. if self.qid == 0 { for (_index, cmd) in self.commands.commands.iter() { From 10bee8b30c3d26014f62e3391b3a45f675523a3f Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 6 May 2026 14:26:01 -0700 Subject: [PATCH 10/11] Revert "Mesh channels should be borrowed instead of taken making the queue handler loop cancel proof" This reverts commit 1bd18f000e5755b867b031ed18f639750d28529f. --- .../disk_nvme/nvme_driver/src/queue_pair.rs | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index bed5d294f1..73fc432a53 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -347,8 +347,8 @@ impl DrainAfterRestore { struct QueueHandlerLoop { queue_handler: QueueHandler, registers: Arc>, - recv_req: mesh::Receiver, - recv_cmd: mesh::Receiver, + recv_req: Option>, + recv_cmd: Option>, interrupt: DeviceInterrupt, } @@ -362,8 +362,8 @@ impl AsyncRun<()> for QueueHandlerLoop { self.queue_handler .run( &self.registers, - &mut self.recv_req, - &mut self.recv_cmd, + self.recv_req.take().unwrap(), + self.recv_cmd.take().unwrap(), &mut self.interrupt, ) .await; @@ -523,8 +523,8 @@ impl QueuePair { let mut task = TaskControl::new(QueueHandlerLoop { queue_handler, registers, - recv_req, - recv_cmd, + recv_req: Some(recv_req), + recv_cmd: Some(recv_cmd), interrupt, }); task.insert(spawner, "nvme-queue", ()); @@ -1159,13 +1159,11 @@ struct QueueStats { } impl QueueHandler { - /// Run the queue handler loop. This needs to be cancel safe because the - /// task can be stopped at any time and restarted. async fn run( &mut self, registers: &DeviceRegisters, - recv_req: &mut mesh::Receiver, - recv_cmd: &mut mesh::Receiver, + mut recv_req: mesh::Receiver, + mut recv_cmd: mesh::Receiver, interrupt: &mut DeviceInterrupt, ) { if matches!( @@ -1249,7 +1247,7 @@ impl QueueHandler { Event::Request(req) => match req { Req::Save(queue_state) => { tracing::info!(pci_id = ?self.device_id, qid = ?self.qid, "received save request, shutting down ..."); - queue_state.complete(self.save()); + queue_state.complete(self.save().await); // Do not allow any more processing after save completed. break; } @@ -1312,7 +1310,7 @@ impl QueueHandler { } /// Save queue data for servicing. - pub fn save(&self) -> anyhow::Result { + pub async fn save(&self) -> anyhow::Result { // Log pending admin command wait durations at save time. 
if self.qid == 0 { for (_index, cmd) in self.commands.commands.iter() { From 4d5be88912c66a01fc38708bcb6a775c34bb859a Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 6 May 2026 14:29:30 -0700 Subject: [PATCH 11/11] Use a reference to the node to avoid a move --- vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 6bb0a0beca..19f508fe10 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1300,7 +1300,7 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( .await?; tracing::info!("vm inspected {}", vm_inspect.json()); - let entries = match vm_inspect { + let entries = match &vm_inspect { inspect::Node::Dir(entries) => entries, _ => { panic!(