diff --git a/Cargo.lock b/Cargo.lock index cc95e2e8e3..73a365837b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10042,6 +10042,7 @@ dependencies = [ "get_resources", "guid", "hyperv_ic_resources", + "inspect", "jiff", "kmsg", "memory_range", diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 25bb4472df..2e08fa03b9 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -1308,32 +1308,29 @@ impl AsyncRun for DriverWorkerTask { stop: &mut task_control::StopTask<'_>, state: &mut WorkerState, ) -> Result<(), task_control::Cancelled> { - let r = stop - .until_stopped(async { - loop { - match self.recv.next().await { - Some(NvmeWorkerRequest::CreateIssuer(rpc)) => { - rpc.handle(async |cpu| self.create_io_issuer(state, cpu).await) - .await - } - Some(NvmeWorkerRequest::Save(rpc)) => { - rpc.handle(async |span| { - let child_span = tracing::info_span!( - parent: &span, - "nvme_worker_save", - pci_id = %self.device.id() - ); - self.save(state).instrument(child_span).await - }) - .await - } - None => break, - } + loop { + let cmd = stop.until_stopped(self.recv.next()).await?; + match cmd { + Some(NvmeWorkerRequest::CreateIssuer(rpc)) => { + rpc.handle(async |cpu| self.create_io_issuer(state, cpu).await) + .await } - }) - .await; - tracing::info!(pci_id = %self.device.id(), "nvme worker task exiting"); - r + Some(NvmeWorkerRequest::Save(rpc)) => { + rpc.handle(async |span| { + let child_span = tracing::info_span!( + parent: &span, + "nvme_worker_save", + pci_id = %self.device.id() + ); + self.save(state).instrument(child_span).await + }) + .await + } + None => break, + } + } + tracing::info!(pci_id = %self.device.id(), "nvme worker task exiting cleanly"); + Ok(()) } } diff --git a/vmm_tests/vmm_tests/Cargo.toml b/vmm_tests/vmm_tests/Cargo.toml index c2e7740a73..f19360d8bc 100644 --- a/vmm_tests/vmm_tests/Cargo.toml +++ b/vmm_tests/vmm_tests/Cargo.toml @@ -23,6 +23,7 @@ tmk_tests.workspace = true petri_artifacts_common.workspace = true petri.workspace = true +inspect.workspace = true get_resources.workspace = true openvmm_defs.workspace = true diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index e3c74aa96f..19f508fe10 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1169,6 +1169,151 @@ async fn servicing_keepalive_slow_create_io_queue( Ok(()) } +/// Verifies that save works correctly when a create_io_queue command +/// is still in flight and inspect is called on the device. Previously we saw +/// inspect calls inadvertently throwing away create_io_issuer futures and then +/// save being serviced with CREATE_IO_COMPLETION_QUEUE commands still pending. +#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] +async fn servicing_keepalive_slow_create_io_queue_with_inspect( + config: PetriVmBuilder, + (igvm_file,): (ResolvedArtifact,), +) -> Result<(), anyhow::Error> { + const QUEUE_CREATION_DELAY: Duration = Duration::from_secs(60); + const TRIGGER_CREATE_IO_QUEUE_TIMEOUT: Duration = Duration::from_secs(5); + const TOTAL_SAVE_TIMEOUT: Duration = Duration::from_secs(15); + + let mut flags = config.default_servicing_flags(); + flags.enable_nvme_keepalive = true; + let mut fault_start_updater = CellUpdater::new(false); + + let fault_configuration = FaultConfiguration::new(fault_start_updater.cell()) + .with_admin_queue_fault( + AdminQueueFaultConfig::new().with_submission_queue_fault( + CommandMatchBuilder::new() + .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) + .build(), + AdminQueueFaultBehavior::Delay(QUEUE_CREATION_DELAY), + ), + ); + + let scsi_controller_guid = Guid::new_random(); + let disk_size = 4 * 1024 * 1024; // 4 MiB — enough for dd reads + let vp_count = 4; + + let (mut vm, agent) = create_keepalive_test_config_custom_vps( + config, + fault_configuration, + VTL0_NVME_LUN, + scsi_controller_guid, + disk_size, + vp_count, + ) + .await?; + agent.ping().await?; + + let cpus_with_issuers = find_cpus_with_io_issuers(&vm).await?; + let target_cpu = (0u32..vp_count) + .find(|cpu| !cpus_with_issuers.contains(cpu)) + .unwrap_or_else(|| { + panic!( + "all {vp_count} CPUs already have IO issuers after boot — \ + test cannot exercise create_io_queue. Consider increasing vp_count." + ) + }); + tracing::info!( + target_cpu, + existing_issuers = ?cpus_with_issuers, + "selected target CPU with no IO issuer" + ); + + // Resolve the disk path before save. The device might appear as /dev/sda + // or /dev/sdb depending on timing. + let device_paths = get_device_paths( + &agent, + scsi_controller_guid, + vec![ExpectedGuestDevice { + lun: VTL0_NVME_LUN, + disk_size_sectors: (disk_size / SCSI_SECTOR_SIZE) as usize, + friendly_name: "nvme_disk".to_string(), + }], + ) + .await?; + assert!(device_paths.len() == 1); + let disk_path = &device_paths[0]; + + // DEV NOTE: `run_cpu_pinned_io` only needs to be run for a duration that + // guarantees the create_io_queue command getting stuck. Ideally this should + // be event driven instead of time driven, but the infrastructure for that + // is not in place yet. + // Even though the dd command will timeout, the run loop will be stuck until + // the create_io_queue command completes. + fault_start_updater.set(true).await; + let io_result = CancelContext::new() + .with_timeout(TRIGGER_CREATE_IO_QUEUE_TIMEOUT) + .until_cancelled(run_cpu_pinned_io(&agent, disk_path, target_cpu)) + .await; + + assert!( + io_result.is_err(), + "IO command should have timed out. This likely means the create_io_queue command did not get injected correctly." + ); + + // In previous versions invoking inspect would cause the DriverWorkerTask to + // just drop the stuck create io queue command and service the save with + // pending admin commands (not good) + let nvme_device_inspect = vm.inspect_openhcl("vm/nvme/devices", None, None).await?; + tracing::info!(nvme_device_inspect = ?nvme_device_inspect, "nvme device inspected"); + + let entries = match &nvme_device_inspect { + inspect::Node::Dir(entries) => entries, + _ => panic!( + "expected dir for 'vm/nvme/devices' but found {}", + nvme_device_inspect.json() + ), + }; + assert_eq!( + entries.len(), + 1, + "expected exactly 1 NVMe device under 'vm/nvme/devices', found {}", + entries.len() + ); + let nvme_device_name = entries[0].name.clone(); + + CancelContext::new() + .with_timeout(TOTAL_SAVE_TIMEOUT) + .until_cancelled(vm.save_openhcl(igvm_file.clone(), flags)) + .await + .expect("VM save did not complete within the given timeout, even though it should have. Save is stuck when draining after restore with slow create_io_queue.") + .expect("Save failed"); + + fault_start_updater.set(false).await; + + vm.restore_openhcl().await?; + agent.ping().await?; + + let vm_inspect = vm + .inspect_openhcl( + &format!("vm/nvme/devices/{nvme_device_name}/driver/driver/admin/commands/commands"), + None, + None, + ) + .await?; + + tracing::info!("vm inspected {}", vm_inspect.json()); + let entries = match &vm_inspect { + inspect::Node::Dir(entries) => entries, + _ => { + panic!( + "expected list of pending commands but found {}", + vm_inspect.json() + ); + } + }; + assert_eq!(entries.len(), 1, "expected only 1 entry, the AER command"); + + Ok(()) +} + async fn apply_fault_with_keepalive( config: PetriVmBuilder, fault_configuration: FaultConfiguration,