From 0a9f8715a1e5d5d4b0d840dd205ab6d74b696453 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Fri, 1 May 2026 14:46:36 -0700 Subject: [PATCH 01/17] Should enable keepalive but only for ASAP devices --- .../src/nvme_manager/manager.rs | 41 ++++++++++++++++--- .../underhill_core/src/nvme_manager/mod.rs | 17 ++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 1b59109fec..d901748574 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -5,6 +5,7 @@ use crate::nvme_manager::CreateNvmeDriver; use crate::nvme_manager::device::NvmeDriverManager; use crate::nvme_manager::device::NvmeDriverManagerClient; use crate::nvme_manager::device::NvmeDriverShutdownOptions; +use crate::nvme_manager::is_nvme_keepalive_compatible; use crate::nvme_manager::save_restore::NvmeManagerSavedState; use crate::nvme_manager::save_restore::NvmeSavedDiskConfig; use crate::servicing::NvmeSavedState; @@ -301,12 +302,27 @@ impl NvmeManagerWorker { async { join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| { + // nvme_keepalive is received from host but it is only valid + // when memory pool allocator supports save/restore. Further, + // as a partial mitigation for known incompatibilities between + // keepalive and NVMe Direct v2 devices, we only honor + // keepalive for ASAP devices (identified by the VPCI + // instance GUID containing `c05b`). + let host_requested_keepalive = + nvme_keepalive && self.context.save_restore_supported; + let device_keepalive = + host_requested_keepalive && is_nvme_keepalive_compatible(&pci_id); + if host_requested_keepalive && !device_keepalive { + tracing::info!( + %pci_id, + "disabling nvme keepalive for non-ASAP device; \ + falling back to reset-on-servicing" + ); + } driver .shutdown(NvmeDriverShutdownOptions { - // nvme_keepalive is received from host but it is only valid - // when memory pool allocator supports save/restore. - do_not_reset: nvme_keepalive && self.context.save_restore_supported, - skip_device_shutdown: nvme_keepalive && self.context.save_restore_supported, + do_not_reset: device_keepalive, + skip_device_shutdown: device_keepalive, }) .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id)) })) @@ -419,12 +435,27 @@ impl NvmeManagerWorker { /// Saves NVMe device's states into buffer during servicing. pub async fn save(&mut self) -> anyhow::Result { let mut nvme_disks: Vec = Vec::new(); + // Only save state for devices known to be keepalive-compatible. + // Non-ASAP (NVMe Direct v2) devices will be reset on servicing, so + // persisting their state would be a waste and could conflict with + // the reset path on restore. let mut devices_to_save: HashMap = self .context .devices .write() .iter() - .map(|(pci_id, driver)| (pci_id.clone(), driver.client().clone())) + .filter_map(|(pci_id, driver)| { + if is_nvme_keepalive_compatible(pci_id) { + Some((pci_id.clone(), driver.client().clone())) + } else { + tracing::info!( + %pci_id, + "skipping save of non-ASAP nvme device; \ + keepalive disabled for this device" + ); + None + } + }) .collect(); for (pci_id, client) in devices_to_save.iter_mut() { nvme_disks.push(NvmeSavedDiskConfig { diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index cc8977de45..9ac604f0a9 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -111,3 +111,20 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { saved_state: Option<&nvme_driver::save_restore::NvmeDriverSavedState>, ) -> Result, NvmeSpawnerError>; } + +/// Returns whether NVMe keepalive (servicing without device reset) is +/// known to be compatible with the device identified by this VPCI +/// instance ID. +/// +/// NVMe keepalive currently has known incompatibilities with NVMe Direct +/// v2 devices. As a partial mitigation, we only enable keepalive for +/// ASAP devices, which can be identified by the substring `c05b` in the +/// VPCI instance GUID. All other devices (NVMe Direct v2) fall back to +/// the legacy reset-on-servicing path even when the host-level keepalive +/// flag is enabled. +/// +/// This heuristic is intentionally fragile and is intended only as a +/// temporary mitigation while the real compatibility work is in flight. +pub fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { + pci_id.to_ascii_lowercase().starts_with("c05b") +} From aec5033ad44ed1f6e58b107833c038b0a1894569 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Fri, 1 May 2026 15:54:07 -0700 Subject: [PATCH 02/17] I think this is in better shape --- .../src/nvme_manager/manager.rs | 32 +++++-------------- .../underhill_core/src/nvme_manager/mod.rs | 15 ++------- 2 files changed, 10 insertions(+), 37 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index d901748574..5ef43eb555 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -302,27 +302,14 @@ impl NvmeManagerWorker { async { join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| { - // nvme_keepalive is received from host but it is only valid - // when memory pool allocator supports save/restore. Further, - // as a partial mitigation for known incompatibilities between - // keepalive and NVMe Direct v2 devices, we only honor - // keepalive for ASAP devices (identified by the VPCI - // instance GUID containing `c05b`). - let host_requested_keepalive = - nvme_keepalive && self.context.save_restore_supported; - let device_keepalive = - host_requested_keepalive && is_nvme_keepalive_compatible(&pci_id); - if host_requested_keepalive && !device_keepalive { - tracing::info!( - %pci_id, - "disabling nvme keepalive for non-ASAP device; \ - falling back to reset-on-servicing" - ); - } driver .shutdown(NvmeDriverShutdownOptions { - do_not_reset: device_keepalive, - skip_device_shutdown: device_keepalive, + do_not_reset: nvme_keepalive + && self.context.save_restore_supported + && is_nvme_keepalive_compatible(&pci_id), + skip_device_shutdown: nvme_keepalive + && self.context.save_restore_supported + && !is_nvme_keepalive_compatible(&pci_id), }) .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id)) })) @@ -375,7 +362,7 @@ impl NvmeManagerWorker { &context.driver_source, &pci_id, context.vp_count, - context.save_restore_supported, + context.save_restore_supported && is_nvme_keepalive_compatible(&pci_id), None, // No device yet, context.nvme_driver_spawner.clone(), )?; @@ -436,9 +423,6 @@ impl NvmeManagerWorker { pub async fn save(&mut self) -> anyhow::Result { let mut nvme_disks: Vec = Vec::new(); // Only save state for devices known to be keepalive-compatible. - // Non-ASAP (NVMe Direct v2) devices will be reset on servicing, so - // persisting their state would be a waste and could conflict with - // the reset path on restore. let mut devices_to_save: HashMap = self .context .devices @@ -450,7 +434,7 @@ impl NvmeManagerWorker { } else { tracing::info!( %pci_id, - "skipping save of non-ASAP nvme device; \ + "skipping save of nvme device; \ keepalive disabled for this device" ); None diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 9ac604f0a9..33b300993d 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -112,19 +112,8 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { ) -> Result, NvmeSpawnerError>; } -/// Returns whether NVMe keepalive (servicing without device reset) is -/// known to be compatible with the device identified by this VPCI -/// instance ID. -/// -/// NVMe keepalive currently has known incompatibilities with NVMe Direct -/// v2 devices. As a partial mitigation, we only enable keepalive for -/// ASAP devices, which can be identified by the substring `c05b` in the -/// VPCI instance GUID. All other devices (NVMe Direct v2) fall back to -/// the legacy reset-on-servicing path even when the host-level keepalive -/// flag is enabled. -/// -/// This heuristic is intentionally fragile and is intended only as a -/// temporary mitigation while the real compatibility work is in flight. +/// Returns whether the given PCI ID corresponds to an NVMe device that is +/// compatible with the keepalive. pub fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { pci_id.to_ascii_lowercase().starts_with("c05b") } From 1c8bc557cbc8f1e03b27f64c02c81d866e2bc3a8 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Fri, 1 May 2026 16:05:17 -0700 Subject: [PATCH 03/17] Don't start with keepalive either --- openhcl/underhill_core/src/nvme_manager/manager.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 5ef43eb555..7934dacb64 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -304,6 +304,8 @@ impl NvmeManagerWorker { join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| { driver .shutdown(NvmeDriverShutdownOptions { + // nvme_keepalive is received from host but it is only valid + // when memory pool allocator supports save/restore. do_not_reset: nvme_keepalive && self.context.save_restore_supported && is_nvme_keepalive_compatible(&pci_id), From 30b5bad61e623443e5c7733db402d10b049962d0 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Fri, 1 May 2026 16:07:08 -0700 Subject: [PATCH 04/17] Setting up the flags correctly --- openhcl/underhill_core/src/nvme_manager/manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 7934dacb64..7b04d9e3bd 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -311,7 +311,7 @@ impl NvmeManagerWorker { && is_nvme_keepalive_compatible(&pci_id), skip_device_shutdown: nvme_keepalive && self.context.save_restore_supported - && !is_nvme_keepalive_compatible(&pci_id), + && is_nvme_keepalive_compatible(&pci_id), }) .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id)) })) From 6076ef667d49ebbb9c75f34efa44e12b0dc55ce1 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 11:54:59 -0700 Subject: [PATCH 05/17] Added a test to verify that only ASAP devices are being reset --- .../tests/multiarch/openhcl_servicing.rs | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index e3c74aa96f..6072e0907c 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1169,6 +1169,191 @@ async fn servicing_keepalive_slow_create_io_queue( Ok(()) } +/// Verifies the per-device NVMe keepalive gate. +/// +/// Two NVMe controllers are attached to a single VM: +/// * An "ASAP" device whose VPCI instance ID has `data2 = 0xc05b`, which +/// makes its derived pci_id start with `c05b:` (matching the +/// keepalive-compatible heuristic). Keepalive must be honored for +/// this device — its controller is not reset across servicing, so +/// `CREATE_IO_COMPLETION_QUEUE` must NOT be issued after servicing. +/// The fault panics if the opcode is observed. +/// * An "NVMe Direct v2" device whose pci_id does not start with +/// `c05b:`. Keepalive must be downgraded to reset-on-servicing for +/// this device, so `CREATE_IO_COMPLETION_QUEUE` MUST be issued after +/// servicing. The fault verifies (via a oneshot) that the opcode is +/// observed. +/// +/// Keepalive is enabled VM-wide; the gate is expected to apply per +/// device based on its pci_id. +#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] +async fn servicing_keepalive_per_device_gate( + config: PetriVmBuilder, + (igvm_file,): (ResolvedArtifact,), +) -> Result<(), anyhow::Error> { + let mut flags = config.default_servicing_flags(); + flags.enable_nvme_keepalive = true; + + // VPCI instance IDs. The pci_id seen by the NVMe driver is derived + // as `format!("{:04x}:00:00.0", instance_id.data2)` (see + // `vpci_path` in `dispatch/vtl2_settings_worker.rs`), so `data2` + // controls the prefix of the pci_id. For ASAP we set data2 = 0xc05b + // so its pci_id is `c05b:00:00.0`. + const ASAP_NVME_INSTANCE: Guid = guid::guid!("00000000-c05b-0000-0000-000000000001"); + const ND2_NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); + + const ASAP_NSID: u32 = KEEPALIVE_VTL2_NSID; + const ND2_NSID: u32 = KEEPALIVE_VTL2_NSID + 1; + const ASAP_LUN: u32 = VTL0_NVME_LUN; + const ND2_LUN: u32 = VTL0_NVME_LUN + 1; + + // Two independent fault start cells — one per device. + let mut asap_fault_updater = CellUpdater::new(false); + let mut nd2_fault_updater = CellUpdater::new(false); + + let (nd2_create_seen_send, nd2_create_seen_recv) = mesh::oneshot::<()>(); + + // ASAP (c05b) device: keepalive must be honored — fail loudly if any + // CREATE_IO_COMPLETION_QUEUE is observed after the fault is armed. + let asap_fault_config = FaultConfiguration::new(asap_fault_updater.cell()) + .with_admin_queue_fault( + AdminQueueFaultConfig::new().with_submission_queue_fault( + CommandMatchBuilder::new() + .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) + .build(), + AdminQueueFaultBehavior::Panic( + "ASAP (c05b) device received CREATE_IO_COMPLETION_QUEUE after servicing — \ + keepalive should have been honored for this device but the controller \ + was reset." + .to_string(), + ), + ), + ); + + // NVMe Direct v2 (non-c05b) device: keepalive must be downgraded — + // verify CREATE_IO_COMPLETION_QUEUE IS issued after servicing. + let nd2_fault_config = FaultConfiguration::new(nd2_fault_updater.cell()) + .with_admin_queue_fault( + AdminQueueFaultConfig::new().with_submission_queue_fault( + CommandMatchBuilder::new() + .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) + .build(), + AdminQueueFaultBehavior::Verify(Some(nd2_create_seen_send)), + ), + ); + + let scsi_instance = Guid::new_random(); + + let (mut vm, agent) = config + .with_vmbus_redirect(true) + .with_openhcl_command_line( + "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + ) + .modify_backend(move |b| { + b.with_custom_config(move |c| { + c.vpci_devices.push(VpciDeviceConfig { + vtl: DeviceVtl::Vtl2, + instance_id: ASAP_NVME_INSTANCE, + resource: NvmeFaultControllerHandle { + subsystem_id: Guid::new_random(), + msix_count: 10, + max_io_queues: 10, + namespaces: vec![NamespaceDefinition { + nsid: ASAP_NSID, + read_only: false, + disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { + len: Some(DEFAULT_DISK_SIZE), + sector_size: None, + }) + .into_resource(), + }], + fault_config: asap_fault_config, + enable_tdisp_tests: false, + } + .into_resource(), + }); + c.vpci_devices.push(VpciDeviceConfig { + vtl: DeviceVtl::Vtl2, + instance_id: ND2_NVME_INSTANCE, + resource: NvmeFaultControllerHandle { + subsystem_id: Guid::new_random(), + msix_count: 10, + max_io_queues: 10, + namespaces: vec![NamespaceDefinition { + nsid: ND2_NSID, + read_only: false, + disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { + len: Some(DEFAULT_DISK_SIZE), + sector_size: None, + }) + .into_resource(), + }], + fault_config: nd2_fault_config, + enable_tdisp_tests: false, + } + .into_resource(), + }); + }) + }) + .add_vtl2_storage_controller( + Vtl2StorageControllerBuilder::new(ControllerType::Scsi) + .with_instance_id(scsi_instance) + .add_lun( + Vtl2LunBuilder::disk() + .with_location(ASAP_LUN) + .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( + ControllerType::Nvme, + ASAP_NVME_INSTANCE, + ASAP_NSID, + )), + ) + .add_lun( + Vtl2LunBuilder::disk() + .with_location(ND2_LUN) + .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( + ControllerType::Nvme, + ND2_NVME_INSTANCE, + ND2_NSID, + )), + ) + .build(), + ) + .run() + .await?; + + agent.ping().await?; + + // Arm both faults BEFORE servicing so that the post-servicing + // CREATE_IO_COMPLETION_QUEUE is what each fault matches. + asap_fault_updater.set(true).await; + nd2_fault_updater.set(true).await; + + vm.restart_openhcl(igvm_file.clone(), flags).await?; + + agent.ping().await?; + + // The non-c05b device must issue CREATE_IO_COMPLETION_QUEUE after + // servicing because its keepalive was downgraded to reset. + CancelContext::new() + .with_timeout(Duration::from_secs(60)) + .until_cancelled(nd2_create_seen_recv) + .await + .expect( + "non-c05b NVMe device did not issue CREATE_IO_COMPLETION_QUEUE within 60s after \ + servicing — the per-device keepalive gate did not downgrade keepalive for this \ + device.", + ) + .expect("CREATE_IO_COMPLETION_QUEUE verification on non-c05b device failed"); + + // If the c05b device had received CREATE_IO_COMPLETION_QUEUE, the + // panic fault would have crashed the VM and failed this test. We + // disarm the faults defensively before tearing down. + asap_fault_updater.set(false).await; + nd2_fault_updater.set(false).await; + + Ok(()) +} + async fn apply_fault_with_keepalive( config: PetriVmBuilder, fault_configuration: FaultConfiguration, From 1e19679ae91be3471fb92e09a1e0fba7a53b3f7c Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 12:01:57 -0700 Subject: [PATCH 06/17] Update comments --- .../vmm_tests/tests/tests/multiarch/openhcl_servicing.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 6072e0907c..2696f462b3 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1178,7 +1178,7 @@ async fn servicing_keepalive_slow_create_io_queue( /// this device — its controller is not reset across servicing, so /// `CREATE_IO_COMPLETION_QUEUE` must NOT be issued after servicing. /// The fault panics if the opcode is observed. -/// * An "NVMe Direct v2" device whose pci_id does not start with +/// * A device whose pci_id does not start with /// `c05b:`. Keepalive must be downgraded to reset-on-servicing for /// this device, so `CREATE_IO_COMPLETION_QUEUE` MUST be issued after /// servicing. The fault verifies (via a oneshot) that the opcode is @@ -1230,7 +1230,7 @@ async fn servicing_keepalive_per_device_gate( ), ); - // NVMe Direct v2 (non-c05b) device: keepalive must be downgraded — + // Non c05b device: keepalive must be downgraded — // verify CREATE_IO_COMPLETION_QUEUE IS issued after servicing. let nd2_fault_config = FaultConfiguration::new(nd2_fault_updater.cell()) .with_admin_queue_fault( From fc20c570ad5097794b540506d20400f35b9d7a8d Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 12:17:55 -0700 Subject: [PATCH 07/17] Added comment fix and removed magic numbers out of the function definition --- openhcl/underhill_core/src/nvme_manager/mod.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 33b300993d..b0adf280e0 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -67,6 +67,10 @@ pub struct NamespaceError { source: NvmeSpawnerError, } +/// PCI vendor-ID prefix for NVMe devices that support keepalive, including +/// the expected `:` separator in the `dddd:bb:dd.f` PCI ID format. +const KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX: &str = "c05b:"; + #[derive(Debug, Error)] pub enum NvmeSpawnerError { #[error("failed to initialize vfio device")] @@ -114,6 +118,9 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { /// Returns whether the given PCI ID corresponds to an NVMe device that is /// compatible with the keepalive. +/// DEV_NOTE: This is a heuristic based on the PCI vendor ID, which is not ideal but is necessary pub fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { - pci_id.to_ascii_lowercase().starts_with("c05b") + pci_id + .to_ascii_lowercase() + .starts_with(KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX) } From 4c15d5d586a1539036e95e77e7ea6d0faf0fcb43 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 12:32:33 -0700 Subject: [PATCH 08/17] More variable name cleanup --- .../underhill_core/src/nvme_manager/mod.rs | 2 +- .../tests/multiarch/openhcl_servicing.rs | 66 +++++++++---------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index b0adf280e0..d2540d7d76 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -119,7 +119,7 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { /// Returns whether the given PCI ID corresponds to an NVMe device that is /// compatible with the keepalive. /// DEV_NOTE: This is a heuristic based on the PCI vendor ID, which is not ideal but is necessary -pub fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { +pub(crate) fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { pci_id .to_ascii_lowercase() .starts_with(KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 2696f462b3..3c654d51dc 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1172,7 +1172,7 @@ async fn servicing_keepalive_slow_create_io_queue( /// Verifies the per-device NVMe keepalive gate. /// /// Two NVMe controllers are attached to a single VM: -/// * An "ASAP" device whose VPCI instance ID has `data2 = 0xc05b`, which +/// * A "c05b" device whose VPCI instance ID has `data2 = 0xc05b`, which /// makes its derived pci_id start with `c05b:` (matching the /// keepalive-compatible heuristic). Keepalive must be honored for /// this device — its controller is not reset across servicing, so @@ -1197,32 +1197,32 @@ async fn servicing_keepalive_per_device_gate( // VPCI instance IDs. The pci_id seen by the NVMe driver is derived // as `format!("{:04x}:00:00.0", instance_id.data2)` (see // `vpci_path` in `dispatch/vtl2_settings_worker.rs`), so `data2` - // controls the prefix of the pci_id. For ASAP we set data2 = 0xc05b + // controls the prefix of the pci_id. For the device with keepalive we set data2 = 0xc05b // so its pci_id is `c05b:00:00.0`. - const ASAP_NVME_INSTANCE: Guid = guid::guid!("00000000-c05b-0000-0000-000000000001"); - const ND2_NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); + const WITH_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("00000000-c05b-0000-0000-000000000001"); + const NO_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); - const ASAP_NSID: u32 = KEEPALIVE_VTL2_NSID; - const ND2_NSID: u32 = KEEPALIVE_VTL2_NSID + 1; - const ASAP_LUN: u32 = VTL0_NVME_LUN; - const ND2_LUN: u32 = VTL0_NVME_LUN + 1; + const WITH_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID; + const NO_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID + 1; + const WITH_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN; + const NO_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN + 1; // Two independent fault start cells — one per device. - let mut asap_fault_updater = CellUpdater::new(false); - let mut nd2_fault_updater = CellUpdater::new(false); + let mut with_keepalive_fault_updater = CellUpdater::new(false); + let mut no_keepalive_fault_updater = CellUpdater::new(false); - let (nd2_create_seen_send, nd2_create_seen_recv) = mesh::oneshot::<()>(); + let (no_keepalive_create_seen_send, no_keepalive_create_seen_recv) = mesh::oneshot::<()>(); - // ASAP (c05b) device: keepalive must be honored — fail loudly if any + // c05b device: keepalive must be honored — fail loudly if any // CREATE_IO_COMPLETION_QUEUE is observed after the fault is armed. - let asap_fault_config = FaultConfiguration::new(asap_fault_updater.cell()) + let with_keepalive_fault_config = FaultConfiguration::new(with_keepalive_fault_updater.cell()) .with_admin_queue_fault( AdminQueueFaultConfig::new().with_submission_queue_fault( CommandMatchBuilder::new() .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) .build(), AdminQueueFaultBehavior::Panic( - "ASAP (c05b) device received CREATE_IO_COMPLETION_QUEUE after servicing — \ + "c05b device received CREATE_IO_COMPLETION_QUEUE after servicing — \ keepalive should have been honored for this device but the controller \ was reset." .to_string(), @@ -1232,13 +1232,13 @@ async fn servicing_keepalive_per_device_gate( // Non c05b device: keepalive must be downgraded — // verify CREATE_IO_COMPLETION_QUEUE IS issued after servicing. - let nd2_fault_config = FaultConfiguration::new(nd2_fault_updater.cell()) + let no_keepalive_fault_config = FaultConfiguration::new(no_keepalive_fault_updater.cell()) .with_admin_queue_fault( AdminQueueFaultConfig::new().with_submission_queue_fault( CommandMatchBuilder::new() .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) .build(), - AdminQueueFaultBehavior::Verify(Some(nd2_create_seen_send)), + AdminQueueFaultBehavior::Verify(Some(no_keepalive_create_seen_send)), ), ); @@ -1253,13 +1253,13 @@ async fn servicing_keepalive_per_device_gate( b.with_custom_config(move |c| { c.vpci_devices.push(VpciDeviceConfig { vtl: DeviceVtl::Vtl2, - instance_id: ASAP_NVME_INSTANCE, + instance_id: WITH_KEEPALIVE_NVME_INSTANCE, resource: NvmeFaultControllerHandle { subsystem_id: Guid::new_random(), msix_count: 10, max_io_queues: 10, namespaces: vec![NamespaceDefinition { - nsid: ASAP_NSID, + nsid: WITH_KEEPALIVE_NSID, read_only: false, disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { len: Some(DEFAULT_DISK_SIZE), @@ -1267,20 +1267,20 @@ async fn servicing_keepalive_per_device_gate( }) .into_resource(), }], - fault_config: asap_fault_config, + fault_config: with_keepalive_fault_config, enable_tdisp_tests: false, } .into_resource(), }); c.vpci_devices.push(VpciDeviceConfig { vtl: DeviceVtl::Vtl2, - instance_id: ND2_NVME_INSTANCE, + instance_id: NO_KEEPALIVE_NVME_INSTANCE, resource: NvmeFaultControllerHandle { subsystem_id: Guid::new_random(), msix_count: 10, max_io_queues: 10, namespaces: vec![NamespaceDefinition { - nsid: ND2_NSID, + nsid: NO_KEEPALIVE_NSID, read_only: false, disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { len: Some(DEFAULT_DISK_SIZE), @@ -1288,7 +1288,7 @@ async fn servicing_keepalive_per_device_gate( }) .into_resource(), }], - fault_config: nd2_fault_config, + fault_config: no_keepalive_fault_config, enable_tdisp_tests: false, } .into_resource(), @@ -1300,20 +1300,20 @@ async fn servicing_keepalive_per_device_gate( .with_instance_id(scsi_instance) .add_lun( Vtl2LunBuilder::disk() - .with_location(ASAP_LUN) + .with_location(WITH_KEEPALIVE_LUN) .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( ControllerType::Nvme, - ASAP_NVME_INSTANCE, - ASAP_NSID, + WITH_KEEPALIVE_NVME_INSTANCE, + WITH_KEEPALIVE_NSID, )), ) .add_lun( Vtl2LunBuilder::disk() - .with_location(ND2_LUN) + .with_location(NO_KEEPALIVE_LUN) .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( ControllerType::Nvme, - ND2_NVME_INSTANCE, - ND2_NSID, + NO_KEEPALIVE_NVME_INSTANCE, + NO_KEEPALIVE_NSID, )), ) .build(), @@ -1325,8 +1325,8 @@ async fn servicing_keepalive_per_device_gate( // Arm both faults BEFORE servicing so that the post-servicing // CREATE_IO_COMPLETION_QUEUE is what each fault matches. - asap_fault_updater.set(true).await; - nd2_fault_updater.set(true).await; + with_keepalive_fault_updater.set(true).await; + no_keepalive_fault_updater.set(true).await; vm.restart_openhcl(igvm_file.clone(), flags).await?; @@ -1336,7 +1336,7 @@ async fn servicing_keepalive_per_device_gate( // servicing because its keepalive was downgraded to reset. CancelContext::new() .with_timeout(Duration::from_secs(60)) - .until_cancelled(nd2_create_seen_recv) + .until_cancelled(no_keepalive_create_seen_recv) .await .expect( "non-c05b NVMe device did not issue CREATE_IO_COMPLETION_QUEUE within 60s after \ @@ -1348,8 +1348,8 @@ async fn servicing_keepalive_per_device_gate( // If the c05b device had received CREATE_IO_COMPLETION_QUEUE, the // panic fault would have crashed the VM and failed this test. We // disarm the faults defensively before tearing down. - asap_fault_updater.set(false).await; - nd2_fault_updater.set(false).await; + with_keepalive_fault_updater.set(false).await; + no_keepalive_fault_updater.set(false).await; Ok(()) } From 4c9456c82e152ccfe46312caca93e659d3c30b3d Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 13:22:00 -0700 Subject: [PATCH 09/17] Fixed comments --- openhcl/underhill_core/src/nvme_manager/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index d2540d7d76..0d8b2a620c 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -67,9 +67,9 @@ pub struct NamespaceError { source: NvmeSpawnerError, } -/// PCI vendor-ID prefix for NVMe devices that support keepalive, including -/// the expected `:` separator in the `dddd:bb:dd.f` PCI ID format. -const KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX: &str = "c05b:"; +/// Device prefix for nvme devices that are compatibble with keepalive. +/// the "dddd:" in the `dddd:bb:dd.f` PCI ID format. +const KEEPALIVE_COMPATIBLE_DEVICE_PREFIX: &str = "c05b:"; #[derive(Debug, Error)] pub enum NvmeSpawnerError { @@ -118,9 +118,9 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { /// Returns whether the given PCI ID corresponds to an NVMe device that is /// compatible with the keepalive. -/// DEV_NOTE: This is a heuristic based on the PCI vendor ID, which is not ideal but is necessary +/// DEV_NOTE: This is a weak heuristic based approach which is not ideal but is necessary pub(crate) fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { pci_id .to_ascii_lowercase() - .starts_with(KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX) + .starts_with(KEEPALIVE_COMPATIBLE_DEVICE_PREFIX) } From dbab2149d7599cd394b3f18c11bdc64beecb83ab Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 13:30:06 -0700 Subject: [PATCH 10/17] Will no longer restore devices that are not keepalive comptible. Moreover, it will tear down that device state and start again ... at least it should --- openhcl/underhill_core/src/nvme_manager/manager.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 7b04d9e3bd..ee14680a94 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -465,6 +465,9 @@ impl NvmeManagerWorker { let mut restored_devices: HashMap = HashMap::new(); for disk in &saved_state.nvme_disks { + // Only restore disks that are known to be keepalive compatible. + // This should gracefully tear down state for non-keepalive devices + // before using the device. let pci_id = disk.pci_id.clone(); let nvme_driver = self .context @@ -473,7 +476,7 @@ impl NvmeManagerWorker { &self.context.driver_source, &pci_id, saved_state.cpu_count, - save_restore_supported, + save_restore_supported && is_nvme_keepalive_compatible(&pci_id), Some(&disk.driver_state), ) .await?; From 6412280ee8d66b6b19c3dc53ef346f7ab73adf21 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 16:55:20 -0700 Subject: [PATCH 11/17] Now using vendor and device ID information instead of using pci ID to figure out keepalive info --- .../underhill_core/src/nvme_manager/device.rs | 10 +++ .../src/nvme_manager/manager.rs | 37 +++++++--- .../underhill_core/src/nvme_manager/mod.rs | 55 ++++++++++++--- .../storage/nvme_resources/src/fault.rs | 68 +++++++++++++++++++ vm/devices/storage/nvme_test/src/pci.rs | 15 +++- .../tests/multiarch/openhcl_servicing.rs | 30 ++++---- 6 files changed, 184 insertions(+), 31 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/device.rs b/openhcl/underhill_core/src/nvme_manager/device.rs index 3fb4892297..e9bda747b1 100644 --- a/openhcl/underhill_core/src/nvme_manager/device.rs +++ b/openhcl/underhill_core/src/nvme_manager/device.rs @@ -334,6 +334,9 @@ enum NvmeDriverRequest { pub struct NvmeDriverManager { task: Task<()>, pci_id: String, + /// Whether the underlying device is compatible with NVMe keepalive + /// (cached at construction time so callers don't have to re-read sysfs). + keepalive_compatible: bool, client: NvmeDriverManagerClient, } @@ -358,12 +361,18 @@ impl NvmeDriverManager { &self.client } + /// Returns whether the underlying device is compatible with NVMe keepalive. + pub fn keepalive_compatible(&self) -> bool { + self.keepalive_compatible + } + /// Creates the [`NvmeDriverManager`]. pub fn new( driver_source: &VmTaskDriverSource, pci_id: &str, vp_count: u32, save_restore_supported: bool, + keepalive_compatible: bool, device: Option>, nvme_driver_spawner: Arc, ) -> anyhow::Result { @@ -382,6 +391,7 @@ impl NvmeDriverManager { Ok(Self { task, pci_id: pci_id.into(), + keepalive_compatible, client: NvmeDriverManagerClient { pci_id: pci_id.into(), sender: send, diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index ee14680a94..13f86430a7 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -302,16 +302,17 @@ impl NvmeManagerWorker { async { join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| { + let keepalive_compatible = driver.keepalive_compatible(); driver .shutdown(NvmeDriverShutdownOptions { // nvme_keepalive is received from host but it is only valid // when memory pool allocator supports save/restore. do_not_reset: nvme_keepalive && self.context.save_restore_supported - && is_nvme_keepalive_compatible(&pci_id), + && keepalive_compatible, skip_device_shutdown: nvme_keepalive && self.context.save_restore_supported - && is_nvme_keepalive_compatible(&pci_id), + && keepalive_compatible, }) .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id)) })) @@ -360,11 +361,17 @@ impl NvmeManagerWorker { match guard.entry(pci_id.clone()) { hash_map::Entry::Occupied(_) => unreachable!(), // We checked above that this entry does not exist. hash_map::Entry::Vacant(entry) => { + // Read the PCI vendor/device IDs once and cache the + // resulting keepalive compatibility flag on the + // `NvmeDriverManager` for later save/shutdown queries. + let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id); + let driver = NvmeDriverManager::new( &context.driver_source, &pci_id, context.vp_count, - context.save_restore_supported && is_nvme_keepalive_compatible(&pci_id), + context.save_restore_supported && keepalive_compatible, + keepalive_compatible, None, // No device yet, context.nvme_driver_spawner.clone(), )?; @@ -431,7 +438,7 @@ impl NvmeManagerWorker { .write() .iter() .filter_map(|(pci_id, driver)| { - if is_nvme_keepalive_compatible(pci_id) { + if driver.keepalive_compatible() { Some((pci_id.clone(), driver.client().clone())) } else { tracing::info!( @@ -469,6 +476,11 @@ impl NvmeManagerWorker { // This should gracefully tear down state for non-keepalive devices // before using the device. let pci_id = disk.pci_id.clone(); + + // Read the PCI vendor/device IDs once and cache the resulting + // keepalive compatibility flag on the `NvmeDriverManager`. + let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id); + let nvme_driver = self .context .nvme_driver_spawner @@ -476,7 +488,7 @@ impl NvmeManagerWorker { &self.context.driver_source, &pci_id, saved_state.cpu_count, - save_restore_supported && is_nvme_keepalive_compatible(&pci_id), + save_restore_supported && keepalive_compatible, Some(&disk.driver_state), ) .await?; @@ -488,6 +500,7 @@ impl NvmeManagerWorker { &pci_id, self.context.vp_count, true, // save_restore_supported is always `true` when restoring. + keepalive_compatible, Some(nvme_driver), self.context.nvme_driver_spawner.clone(), )?, @@ -1088,9 +1101,16 @@ mod tests { )); // Create a driver manager - let driver_manager = - NvmeDriverManager::new(&driver_source, "0000:00:04.0", 4, false, None, spawner) - .unwrap(); + let driver_manager = NvmeDriverManager::new( + &driver_source, + "0000:00:04.0", + 4, + false, + false, + None, + spawner, + ) + .unwrap(); let client = driver_manager.client().clone(); @@ -1174,6 +1194,7 @@ mod tests { "0000:00:05.0", 4, true, // save_restore_supported + true, None, spawner, ) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 0d8b2a620c..09bd259ad2 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -67,9 +67,13 @@ pub struct NamespaceError { source: NvmeSpawnerError, } -/// Device prefix for nvme devices that are compatibble with keepalive. -/// the "dddd:" in the `dddd:bb:dd.f` PCI ID format. -const KEEPALIVE_COMPATIBLE_DEVICE_PREFIX: &str = "c05b:"; +/// PCI vendor ID, as it appears in the sysfs `vendor` file (e.g. `0x0100`), +/// for NVMe devices that are compatible with keepalive. +const KEEPALIVE_INCOMPATIBLE_VENDOR_ID: &str = "0x1414"; + +/// PCI device ID, as it appears in the sysfs `device` file (e.g. `0x0100`), +/// for NVMe devices that are compatible with keepalive. +const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0xb111"; #[derive(Debug, Error)] pub enum NvmeSpawnerError { @@ -116,11 +120,44 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { ) -> Result, NvmeSpawnerError>; } -/// Returns whether the given PCI ID corresponds to an NVMe device that is -/// compatible with the keepalive. -/// DEV_NOTE: This is a weak heuristic based approach which is not ideal but is necessary +/// Returns whether the given PCI device is compatible with NVMe keepalive. pub(crate) fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { - pci_id - .to_ascii_lowercase() - .starts_with(KEEPALIVE_COMPATIBLE_DEVICE_PREFIX) + match read_pci_vendor_device_ids(&pci_id) { + Ok((vendor_id, device_id)) => { + vendor_id != KEEPALIVE_INCOMPATIBLE_VENDOR_ID + || device_id != KEEPALIVE_INCOMPATIBLE_DEVICE_ID + } + Err(err) => { + tracing::warn!( + pci_id = %pci_id, + error = err.as_ref() as &dyn std::error::Error, + "failed to read PCI vendor/device IDs; treating device as not keepalive-compatible" + ); + false + } + } +} + +/// Reads the sysfs `vendor` and `device` files for the given PCI device, +/// returning the trimmed contents (e.g. `"0x0100"`). +/// +/// Callers should invoke this once per device and cache the result, since +/// the values do not change for the lifetime of the device. +fn read_pci_vendor_device_ids(pci_id: &str) -> anyhow::Result<(String, String)> { + let devpath = std::path::Path::new("/sys/bus/pci/devices").join(pci_id); + let vendor = fs_err::read_to_string(devpath.join("vendor"))? + .trim_end() + .to_owned(); + let device = fs_err::read_to_string(devpath.join("device"))? + .trim_end() + .to_owned(); + + tracing::info!( + pci_id = %pci_id, + vendor = %vendor, + device = %device, + "read PCI vendor/device IDs" + ); + + Ok((vendor, device)) } diff --git a/vm/devices/storage/nvme_resources/src/fault.rs b/vm/devices/storage/nvme_resources/src/fault.rs index 8fcad86766..8ed05989b6 100644 --- a/vm/devices/storage/nvme_resources/src/fault.rs +++ b/vm/devices/storage/nvme_resources/src/fault.rs @@ -315,6 +315,42 @@ pub struct CommandMatch { pub mask: [u8; 64], } +/// A fault configuration for overriding the device's hardware configuration +/// (i.e. the values reported in PCI configuration space). +/// +/// When a field is `Some`, its value replaces the corresponding field reported +/// by the NVMe fault controller's PCI hardware IDs. When `None`, the real +/// hardware ID value is used unchanged. Unlike most other faults in this +/// module, hardware config faults are applied at device construction time +/// and are therefore not gated by the `fault_active` flag. +/// +/// # Example +/// Override the Vendor ID and Device ID reported in PCI config space. +/// ```no_run +/// use mesh::CellUpdater; +/// use nvme_resources::fault::FaultConfiguration; +/// use nvme_resources::fault::HardwareConfigFaultConfig; +/// +/// pub fn hardware_config_fault() -> FaultConfiguration { +/// let mut fault_start_updater = CellUpdater::new(false); +/// FaultConfiguration::new(fault_start_updater.cell()) +/// .with_hardware_config_fault( +/// HardwareConfigFaultConfig::new() +/// .with_vendor_id(0x1414) +/// .with_device_id(0x00a9), +/// ) +/// } +/// ``` +#[derive(Debug, Copy, Clone, Default, MeshPayload)] +pub struct HardwareConfigFaultConfig { + /// Override for the device's PCI Vendor ID. When `None`, the real Vendor + /// ID is used. + pub vendor_id: Option, + /// Override for the device's PCI Device ID. When `None`, the real Device + /// ID is used. + pub device_id: Option, +} + /// Fault configuration for the NVMe fault controller. /// /// This struct defines behaviors that inject faults into the NVMe fault controller logic, @@ -390,6 +426,8 @@ pub struct FaultConfiguration { pub namespace_fault: NamespaceFaultConfig, /// Fault to apply to all IO queues pub io_fault: Arc, + /// Fault to apply to the Hardware Configuration + pub hardware_config_fault: Option, } impl FaultConfiguration { @@ -404,6 +442,7 @@ impl FaultConfiguration { pci_fault: Some(PciFaultConfig::new()), namespace_fault: NamespaceFaultConfig::new(mesh::channel().1), io_fault: Arc::new(IoQueueFaultConfig::new(fault_active)), + hardware_config_fault: None, } } @@ -430,6 +469,35 @@ impl FaultConfiguration { self.namespace_fault = namespace_fault; self } + + /// Add a hardware config fault configuration to the fault configuration + pub fn with_hardware_config_fault( + mut self, + hardware_config_fault: HardwareConfigFaultConfig, + ) -> Self { + self.hardware_config_fault = Some(hardware_config_fault); + self + } +} + +impl HardwareConfigFaultConfig { + /// Create a new no-op hardware config fault configuration. All fields + /// default to `None`, meaning the real hardware values are used. + pub fn new() -> Self { + Self::default() + } + + /// Override the PCI Vendor ID reported by the device. + pub fn with_vendor_id(mut self, vendor_id: u16) -> Self { + self.vendor_id = Some(vendor_id); + self + } + + /// Override the PCI Device ID reported by the device. + pub fn with_device_id(mut self, device_id: u16) -> Self { + self.device_id = Some(device_id); + self + } } impl PciFaultConfig { diff --git a/vm/devices/storage/nvme_test/src/pci.rs b/vm/devices/storage/nvme_test/src/pci.rs index 278793d3e8..396951f19b 100644 --- a/vm/devices/storage/nvme_test/src/pci.rs +++ b/vm/devices/storage/nvme_test/src/pci.rs @@ -137,10 +137,21 @@ impl NvmeFaultController { BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())), ); + // Apply any hardware-config fault overrides for the IDs reported in + // PCI configuration space, falling back to the real values when no + // override is configured. + let hardware_config_fault = fault_configuration.hardware_config_fault.take(); + let vendor_id = hardware_config_fault + .and_then(|f| f.vendor_id) + .unwrap_or(VENDOR_ID); + let device_id = hardware_config_fault + .and_then(|f| f.device_id) + .unwrap_or(0x00a9); + let cfg_space = ConfigSpaceType0Emulator::new( HardwareIds { - vendor_id: VENDOR_ID, - device_id: 0x00a9, + vendor_id, + device_id, revision_id: 0, prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME, sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY, diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 3c654d51dc..c1055f3cc4 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -19,6 +19,7 @@ use nvme_resources::NvmeFaultControllerHandle; use nvme_resources::fault::AdminQueueFaultBehavior; use nvme_resources::fault::AdminQueueFaultConfig; use nvme_resources::fault::FaultConfiguration; +use nvme_resources::fault::HardwareConfigFaultConfig; use nvme_resources::fault::IoQueueFaultBehavior; use nvme_resources::fault::IoQueueFaultConfig; use nvme_resources::fault::NamespaceChange; @@ -1240,6 +1241,11 @@ async fn servicing_keepalive_per_device_gate( .build(), AdminQueueFaultBehavior::Verify(Some(no_keepalive_create_seen_send)), ), + ) + .with_hardware_config_fault( + HardwareConfigFaultConfig::new() + .with_vendor_id(0x1414) + .with_device_id(0xb111), ); let scsi_instance = Guid::new_random(); @@ -1253,13 +1259,13 @@ async fn servicing_keepalive_per_device_gate( b.with_custom_config(move |c| { c.vpci_devices.push(VpciDeviceConfig { vtl: DeviceVtl::Vtl2, - instance_id: WITH_KEEPALIVE_NVME_INSTANCE, + instance_id: NO_KEEPALIVE_NVME_INSTANCE, resource: NvmeFaultControllerHandle { subsystem_id: Guid::new_random(), msix_count: 10, max_io_queues: 10, namespaces: vec![NamespaceDefinition { - nsid: WITH_KEEPALIVE_NSID, + nsid: NO_KEEPALIVE_NSID, read_only: false, disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { len: Some(DEFAULT_DISK_SIZE), @@ -1267,20 +1273,20 @@ async fn servicing_keepalive_per_device_gate( }) .into_resource(), }], - fault_config: with_keepalive_fault_config, + fault_config: no_keepalive_fault_config, enable_tdisp_tests: false, } .into_resource(), }); c.vpci_devices.push(VpciDeviceConfig { vtl: DeviceVtl::Vtl2, - instance_id: NO_KEEPALIVE_NVME_INSTANCE, + instance_id: WITH_KEEPALIVE_NVME_INSTANCE, resource: NvmeFaultControllerHandle { subsystem_id: Guid::new_random(), msix_count: 10, max_io_queues: 10, namespaces: vec![NamespaceDefinition { - nsid: NO_KEEPALIVE_NSID, + nsid: WITH_KEEPALIVE_NSID, read_only: false, disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { len: Some(DEFAULT_DISK_SIZE), @@ -1288,7 +1294,7 @@ async fn servicing_keepalive_per_device_gate( }) .into_resource(), }], - fault_config: no_keepalive_fault_config, + fault_config: with_keepalive_fault_config, enable_tdisp_tests: false, } .into_resource(), @@ -1300,20 +1306,20 @@ async fn servicing_keepalive_per_device_gate( .with_instance_id(scsi_instance) .add_lun( Vtl2LunBuilder::disk() - .with_location(WITH_KEEPALIVE_LUN) + .with_location(NO_KEEPALIVE_LUN) .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( ControllerType::Nvme, - WITH_KEEPALIVE_NVME_INSTANCE, - WITH_KEEPALIVE_NSID, + NO_KEEPALIVE_NVME_INSTANCE, + NO_KEEPALIVE_NSID, )), ) .add_lun( Vtl2LunBuilder::disk() - .with_location(NO_KEEPALIVE_LUN) + .with_location(WITH_KEEPALIVE_LUN) .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( ControllerType::Nvme, - NO_KEEPALIVE_NVME_INSTANCE, - NO_KEEPALIVE_NSID, + WITH_KEEPALIVE_NVME_INSTANCE, + WITH_KEEPALIVE_NSID, )), ) .build(), From dea6e9e93ff1320c793f5f90ddcd31f49944b00d Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 17:02:50 -0700 Subject: [PATCH 12/17] Minor updates to comments and such --- openhcl/underhill_core/src/nvme_manager/device.rs | 2 -- openhcl/underhill_core/src/nvme_manager/manager.rs | 6 ------ openhcl/underhill_core/src/nvme_manager/mod.rs | 4 ++-- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/device.rs b/openhcl/underhill_core/src/nvme_manager/device.rs index e9bda747b1..cf57f37a75 100644 --- a/openhcl/underhill_core/src/nvme_manager/device.rs +++ b/openhcl/underhill_core/src/nvme_manager/device.rs @@ -334,8 +334,6 @@ enum NvmeDriverRequest { pub struct NvmeDriverManager { task: Task<()>, pci_id: String, - /// Whether the underlying device is compatible with NVMe keepalive - /// (cached at construction time so callers don't have to re-read sysfs). keepalive_compatible: bool, client: NvmeDriverManagerClient, } diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 13f86430a7..910a113f12 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -361,9 +361,6 @@ impl NvmeManagerWorker { match guard.entry(pci_id.clone()) { hash_map::Entry::Occupied(_) => unreachable!(), // We checked above that this entry does not exist. hash_map::Entry::Vacant(entry) => { - // Read the PCI vendor/device IDs once and cache the - // resulting keepalive compatibility flag on the - // `NvmeDriverManager` for later save/shutdown queries. let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id); let driver = NvmeDriverManager::new( @@ -472,9 +469,6 @@ impl NvmeManagerWorker { let mut restored_devices: HashMap = HashMap::new(); for disk in &saved_state.nvme_disks { - // Only restore disks that are known to be keepalive compatible. - // This should gracefully tear down state for non-keepalive devices - // before using the device. let pci_id = disk.pci_id.clone(); // Read the PCI vendor/device IDs once and cache the resulting diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 09bd259ad2..9244f7ba50 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -68,11 +68,11 @@ pub struct NamespaceError { } /// PCI vendor ID, as it appears in the sysfs `vendor` file (e.g. `0x0100`), -/// for NVMe devices that are compatible with keepalive. +/// for NVMe devices that are incompatible with keepalive. const KEEPALIVE_INCOMPATIBLE_VENDOR_ID: &str = "0x1414"; /// PCI device ID, as it appears in the sysfs `device` file (e.g. `0x0100`), -/// for NVMe devices that are compatible with keepalive. +/// for NVMe devices that are incompatible with keepalive. const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0xb111"; #[derive(Debug, Error)] From 7d4c3e236faebb874097529208cb4113649d01d8 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 6 May 2026 17:37:43 -0700 Subject: [PATCH 13/17] Remove per device gate test --- .../tests/multiarch/openhcl_servicing.rs | 190 ------------------ 1 file changed, 190 deletions(-) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index c1055f3cc4..0f2cbf68e2 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1170,196 +1170,6 @@ async fn servicing_keepalive_slow_create_io_queue( Ok(()) } -/// Verifies the per-device NVMe keepalive gate. -/// -/// Two NVMe controllers are attached to a single VM: -/// * A "c05b" device whose VPCI instance ID has `data2 = 0xc05b`, which -/// makes its derived pci_id start with `c05b:` (matching the -/// keepalive-compatible heuristic). Keepalive must be honored for -/// this device — its controller is not reset across servicing, so -/// `CREATE_IO_COMPLETION_QUEUE` must NOT be issued after servicing. -/// The fault panics if the opcode is observed. -/// * A device whose pci_id does not start with -/// `c05b:`. Keepalive must be downgraded to reset-on-servicing for -/// this device, so `CREATE_IO_COMPLETION_QUEUE` MUST be issued after -/// servicing. The fault verifies (via a oneshot) that the opcode is -/// observed. -/// -/// Keepalive is enabled VM-wide; the gate is expected to apply per -/// device based on its pci_id. -#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] -async fn servicing_keepalive_per_device_gate( - config: PetriVmBuilder, - (igvm_file,): (ResolvedArtifact,), -) -> Result<(), anyhow::Error> { - let mut flags = config.default_servicing_flags(); - flags.enable_nvme_keepalive = true; - - // VPCI instance IDs. The pci_id seen by the NVMe driver is derived - // as `format!("{:04x}:00:00.0", instance_id.data2)` (see - // `vpci_path` in `dispatch/vtl2_settings_worker.rs`), so `data2` - // controls the prefix of the pci_id. For the device with keepalive we set data2 = 0xc05b - // so its pci_id is `c05b:00:00.0`. - const WITH_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("00000000-c05b-0000-0000-000000000001"); - const NO_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); - - const WITH_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID; - const NO_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID + 1; - const WITH_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN; - const NO_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN + 1; - - // Two independent fault start cells — one per device. - let mut with_keepalive_fault_updater = CellUpdater::new(false); - let mut no_keepalive_fault_updater = CellUpdater::new(false); - - let (no_keepalive_create_seen_send, no_keepalive_create_seen_recv) = mesh::oneshot::<()>(); - - // c05b device: keepalive must be honored — fail loudly if any - // CREATE_IO_COMPLETION_QUEUE is observed after the fault is armed. - let with_keepalive_fault_config = FaultConfiguration::new(with_keepalive_fault_updater.cell()) - .with_admin_queue_fault( - AdminQueueFaultConfig::new().with_submission_queue_fault( - CommandMatchBuilder::new() - .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) - .build(), - AdminQueueFaultBehavior::Panic( - "c05b device received CREATE_IO_COMPLETION_QUEUE after servicing — \ - keepalive should have been honored for this device but the controller \ - was reset." - .to_string(), - ), - ), - ); - - // Non c05b device: keepalive must be downgraded — - // verify CREATE_IO_COMPLETION_QUEUE IS issued after servicing. - let no_keepalive_fault_config = FaultConfiguration::new(no_keepalive_fault_updater.cell()) - .with_admin_queue_fault( - AdminQueueFaultConfig::new().with_submission_queue_fault( - CommandMatchBuilder::new() - .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) - .build(), - AdminQueueFaultBehavior::Verify(Some(no_keepalive_create_seen_send)), - ), - ) - .with_hardware_config_fault( - HardwareConfigFaultConfig::new() - .with_vendor_id(0x1414) - .with_device_id(0xb111), - ); - - let scsi_instance = Guid::new_random(); - - let (mut vm, agent) = config - .with_vmbus_redirect(true) - .with_openhcl_command_line( - "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", - ) - .modify_backend(move |b| { - b.with_custom_config(move |c| { - c.vpci_devices.push(VpciDeviceConfig { - vtl: DeviceVtl::Vtl2, - instance_id: NO_KEEPALIVE_NVME_INSTANCE, - resource: NvmeFaultControllerHandle { - subsystem_id: Guid::new_random(), - msix_count: 10, - max_io_queues: 10, - namespaces: vec![NamespaceDefinition { - nsid: NO_KEEPALIVE_NSID, - read_only: false, - disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { - len: Some(DEFAULT_DISK_SIZE), - sector_size: None, - }) - .into_resource(), - }], - fault_config: no_keepalive_fault_config, - enable_tdisp_tests: false, - } - .into_resource(), - }); - c.vpci_devices.push(VpciDeviceConfig { - vtl: DeviceVtl::Vtl2, - instance_id: WITH_KEEPALIVE_NVME_INSTANCE, - resource: NvmeFaultControllerHandle { - subsystem_id: Guid::new_random(), - msix_count: 10, - max_io_queues: 10, - namespaces: vec![NamespaceDefinition { - nsid: WITH_KEEPALIVE_NSID, - read_only: false, - disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { - len: Some(DEFAULT_DISK_SIZE), - sector_size: None, - }) - .into_resource(), - }], - fault_config: with_keepalive_fault_config, - enable_tdisp_tests: false, - } - .into_resource(), - }); - }) - }) - .add_vtl2_storage_controller( - Vtl2StorageControllerBuilder::new(ControllerType::Scsi) - .with_instance_id(scsi_instance) - .add_lun( - Vtl2LunBuilder::disk() - .with_location(NO_KEEPALIVE_LUN) - .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( - ControllerType::Nvme, - NO_KEEPALIVE_NVME_INSTANCE, - NO_KEEPALIVE_NSID, - )), - ) - .add_lun( - Vtl2LunBuilder::disk() - .with_location(WITH_KEEPALIVE_LUN) - .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( - ControllerType::Nvme, - WITH_KEEPALIVE_NVME_INSTANCE, - WITH_KEEPALIVE_NSID, - )), - ) - .build(), - ) - .run() - .await?; - - agent.ping().await?; - - // Arm both faults BEFORE servicing so that the post-servicing - // CREATE_IO_COMPLETION_QUEUE is what each fault matches. - with_keepalive_fault_updater.set(true).await; - no_keepalive_fault_updater.set(true).await; - - vm.restart_openhcl(igvm_file.clone(), flags).await?; - - agent.ping().await?; - - // The non-c05b device must issue CREATE_IO_COMPLETION_QUEUE after - // servicing because its keepalive was downgraded to reset. - CancelContext::new() - .with_timeout(Duration::from_secs(60)) - .until_cancelled(no_keepalive_create_seen_recv) - .await - .expect( - "non-c05b NVMe device did not issue CREATE_IO_COMPLETION_QUEUE within 60s after \ - servicing — the per-device keepalive gate did not downgrade keepalive for this \ - device.", - ) - .expect("CREATE_IO_COMPLETION_QUEUE verification on non-c05b device failed"); - - // If the c05b device had received CREATE_IO_COMPLETION_QUEUE, the - // panic fault would have crashed the VM and failed this test. We - // disarm the faults defensively before tearing down. - with_keepalive_fault_updater.set(false).await; - no_keepalive_fault_updater.set(false).await; - - Ok(()) -} - async fn apply_fault_with_keepalive( config: PetriVmBuilder, fault_configuration: FaultConfiguration, From 0af4f8e4da0164a77850d7ec139fc4227375e8bd Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 6 May 2026 17:39:58 -0700 Subject: [PATCH 14/17] Added test gated to nvme direct 2 devices back in --- .../tests/multiarch/openhcl_servicing.rs | 190 ++++++++++++++++++ 1 file changed, 190 insertions(+) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 0e47989e23..f7ada33715 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1315,6 +1315,196 @@ async fn servicing_keepalive_slow_create_io_queue_with_inspect( Ok(()) } +/// Verifies the per-device NVMe keepalive gate. +/// +/// Two NVMe controllers are attached to a single VM: +/// * A "c05b" device whose VPCI instance ID has `data2 = 0xc05b`, which +/// makes its derived pci_id start with `c05b:` (matching the +/// keepalive-compatible heuristic). Keepalive must be honored for +/// this device — its controller is not reset across servicing, so +/// `CREATE_IO_COMPLETION_QUEUE` must NOT be issued after servicing. +/// The fault panics if the opcode is observed. +/// * A device whose pci_id does not start with +/// `c05b:`. Keepalive must be downgraded to reset-on-servicing for +/// this device, so `CREATE_IO_COMPLETION_QUEUE` MUST be issued after +/// servicing. The fault verifies (via a oneshot) that the opcode is +/// observed. +/// +/// Keepalive is enabled VM-wide; the gate is expected to apply per +/// device based on its pci_id. +#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] +async fn servicing_keepalive_per_device_gate( + config: PetriVmBuilder, + (igvm_file,): (ResolvedArtifact,), +) -> Result<(), anyhow::Error> { + let mut flags = config.default_servicing_flags(); + flags.enable_nvme_keepalive = true; + + // VPCI instance IDs. The pci_id seen by the NVMe driver is derived + // as `format!("{:04x}:00:00.0", instance_id.data2)` (see + // `vpci_path` in `dispatch/vtl2_settings_worker.rs`), so `data2` + // controls the prefix of the pci_id. For the device with keepalive we set data2 = 0xc05b + // so its pci_id is `c05b:00:00.0`. + const WITH_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("00000000-c05b-0000-0000-000000000001"); + const NO_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); + + const WITH_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID; + const NO_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID + 1; + const WITH_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN; + const NO_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN + 1; + + // Two independent fault start cells — one per device. + let mut with_keepalive_fault_updater = CellUpdater::new(false); + let mut no_keepalive_fault_updater = CellUpdater::new(false); + + let (no_keepalive_create_seen_send, no_keepalive_create_seen_recv) = mesh::oneshot::<()>(); + + // c05b device: keepalive must be honored — fail loudly if any + // CREATE_IO_COMPLETION_QUEUE is observed after the fault is armed. + let with_keepalive_fault_config = FaultConfiguration::new(with_keepalive_fault_updater.cell()) + .with_admin_queue_fault( + AdminQueueFaultConfig::new().with_submission_queue_fault( + CommandMatchBuilder::new() + .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) + .build(), + AdminQueueFaultBehavior::Panic( + "c05b device received CREATE_IO_COMPLETION_QUEUE after servicing — \ + keepalive should have been honored for this device but the controller \ + was reset." + .to_string(), + ), + ), + ); + + // Non c05b device: keepalive must be downgraded — + // verify CREATE_IO_COMPLETION_QUEUE IS issued after servicing. + let no_keepalive_fault_config = FaultConfiguration::new(no_keepalive_fault_updater.cell()) + .with_admin_queue_fault( + AdminQueueFaultConfig::new().with_submission_queue_fault( + CommandMatchBuilder::new() + .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) + .build(), + AdminQueueFaultBehavior::Verify(Some(no_keepalive_create_seen_send)), + ), + ) + .with_hardware_config_fault( + HardwareConfigFaultConfig::new() + .with_vendor_id(0x1414) + .with_device_id(0xb111), + ); + + let scsi_instance = Guid::new_random(); + + let (mut vm, agent) = config + .with_vmbus_redirect(true) + .with_openhcl_command_line( + "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + ) + .modify_backend(move |b| { + b.with_custom_config(move |c| { + c.vpci_devices.push(VpciDeviceConfig { + vtl: DeviceVtl::Vtl2, + instance_id: NO_KEEPALIVE_NVME_INSTANCE, + resource: NvmeFaultControllerHandle { + subsystem_id: Guid::new_random(), + msix_count: 10, + max_io_queues: 10, + namespaces: vec![NamespaceDefinition { + nsid: NO_KEEPALIVE_NSID, + read_only: false, + disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { + len: Some(DEFAULT_DISK_SIZE), + sector_size: None, + }) + .into_resource(), + }], + fault_config: no_keepalive_fault_config, + enable_tdisp_tests: false, + } + .into_resource(), + }); + c.vpci_devices.push(VpciDeviceConfig { + vtl: DeviceVtl::Vtl2, + instance_id: WITH_KEEPALIVE_NVME_INSTANCE, + resource: NvmeFaultControllerHandle { + subsystem_id: Guid::new_random(), + msix_count: 10, + max_io_queues: 10, + namespaces: vec![NamespaceDefinition { + nsid: WITH_KEEPALIVE_NSID, + read_only: false, + disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { + len: Some(DEFAULT_DISK_SIZE), + sector_size: None, + }) + .into_resource(), + }], + fault_config: with_keepalive_fault_config, + enable_tdisp_tests: false, + } + .into_resource(), + }); + }) + }) + .add_vtl2_storage_controller( + Vtl2StorageControllerBuilder::new(ControllerType::Scsi) + .with_instance_id(scsi_instance) + .add_lun( + Vtl2LunBuilder::disk() + .with_location(NO_KEEPALIVE_LUN) + .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( + ControllerType::Nvme, + NO_KEEPALIVE_NVME_INSTANCE, + NO_KEEPALIVE_NSID, + )), + ) + .add_lun( + Vtl2LunBuilder::disk() + .with_location(WITH_KEEPALIVE_LUN) + .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( + ControllerType::Nvme, + WITH_KEEPALIVE_NVME_INSTANCE, + WITH_KEEPALIVE_NSID, + )), + ) + .build(), + ) + .run() + .await?; + + agent.ping().await?; + + // Arm both faults BEFORE servicing so that the post-servicing + // CREATE_IO_COMPLETION_QUEUE is what each fault matches. + with_keepalive_fault_updater.set(true).await; + no_keepalive_fault_updater.set(true).await; + + vm.restart_openhcl(igvm_file.clone(), flags).await?; + + agent.ping().await?; + + // The non-c05b device must issue CREATE_IO_COMPLETION_QUEUE after + // servicing because its keepalive was downgraded to reset. + CancelContext::new() + .with_timeout(Duration::from_secs(60)) + .until_cancelled(no_keepalive_create_seen_recv) + .await + .expect( + "non-c05b NVMe device did not issue CREATE_IO_COMPLETION_QUEUE within 60s after \ + servicing — the per-device keepalive gate did not downgrade keepalive for this \ + device.", + ) + .expect("CREATE_IO_COMPLETION_QUEUE verification on non-c05b device failed"); + + // If the c05b device had received CREATE_IO_COMPLETION_QUEUE, the + // panic fault would have crashed the VM and failed this test. We + // disarm the faults defensively before tearing down. + with_keepalive_fault_updater.set(false).await; + no_keepalive_fault_updater.set(false).await; + + Ok(()) +} + async fn apply_fault_with_keepalive( config: PetriVmBuilder, fault_configuration: FaultConfiguration, From 0a96c9df5d4eefb8a6ce4fcb965f59719b2fd2e0 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 6 May 2026 17:43:22 -0700 Subject: [PATCH 15/17] Revert "nvme_driver: disable nvme keepalive (#3267)" This reverts commit e84570e190f22cc8ed0508c7449b346c81da5fd3. --- openhcl/openhcl_boot/src/cmdline.rs | 9 +++------ .../tests/tests/multiarch/openhcl_servicing.rs | 18 ++++++------------ .../tests/tests/x86_64/openhcl_uefi.rs | 10 +++++----- 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/openhcl/openhcl_boot/src/cmdline.rs b/openhcl/openhcl_boot/src/cmdline.rs index a452b936e1..64239ecbce 100644 --- a/openhcl/openhcl_boot/src/cmdline.rs +++ b/openhcl/openhcl_boot/src/cmdline.rs @@ -132,7 +132,7 @@ impl BootCommandLineOptions { confidential_debug: false, enable_vtl2_gpa_pool: Vtl2GpaPoolConfig::Heuristics(Vtl2GpaPoolLookupTable::Release), // use the release config by default sidecar: SidecarOptions::default(), - disable_nvme_keep_alive: true, + disable_nvme_keep_alive: false, vtl2_gpa_pool_numa_split: false, } } @@ -141,9 +141,6 @@ impl BootCommandLineOptions { impl BootCommandLineOptions { /// Parse arguments from a command line. pub fn parse(&mut self, cmdline: &str) { - // Workaround for a host side issue: disable NVMe keepalive by default. - self.disable_nvme_keep_alive = true; - let mut override_vtl2_gpa_pool: Option = None; for arg in cmdline.split_whitespace() { if arg.starts_with(OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME) { @@ -186,8 +183,8 @@ impl BootCommandLineOptions { } } else if arg.starts_with(DISABLE_NVME_KEEP_ALIVE) { let arg = arg.split_once('=').map(|(_, arg)| arg); - if arg.is_some_and(|a| a == "0") { - self.disable_nvme_keep_alive = false; + if arg.is_some_and(|a| a != "0") { + self.disable_nvme_keep_alive = true; } } else if arg.starts_with(VTL2_GPA_POOL_NUMA) { if let Some((_, arg)) = arg.split_once('=') { diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index f7ada33715..af86e163d3 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -148,9 +148,7 @@ async fn servicing_keepalive_no_device( ) -> anyhow::Result<()> { let flags = config.default_servicing_flags(); openhcl_servicing_core( - config.with_openhcl_command_line( - "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", - ), + config.with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512"), igvm_file, flags, DEFAULT_SERVICING_COUNT, @@ -199,9 +197,7 @@ async fn servicing_keepalive_with_device( let flags = config.default_servicing_flags(); openhcl_servicing_core( config - .with_openhcl_command_line( - "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", - ) + .with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512") .with_boot_device_type(petri::BootDeviceType::ScsiViaNvme) .with_vmbus_redirect(true), // Need this to attach the NVMe device igvm_file, @@ -1555,9 +1551,7 @@ async fn create_keepalive_test_config_default( config .with_vmbus_redirect(true) - .with_openhcl_command_line( - "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", - ) + .with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512") .modify_backend(move |b| { b.with_custom_config(|c| { // Add a fault controller to test the nvme controller functionality @@ -1645,9 +1639,9 @@ async fn create_keepalive_test_config_custom( ) -> Result<(PetriVm, PipetteClient), anyhow::Error> { const NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); - let mut builder = config.with_vmbus_redirect(true).with_openhcl_command_line( - "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", - ); + let mut builder = config + .with_vmbus_redirect(true) + .with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512"); for cmdline in extra_cmdlines { builder = builder.with_openhcl_command_line(cmdline); diff --git a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs index b43f0ec855..593e78dcc6 100644 --- a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs +++ b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs @@ -193,7 +193,7 @@ async fn nvme_relay_explicit_private_pool( nvme_relay_test_core( config, NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + openhcl_cmdline: "OPENHCL_ENABLE_VTL2_GPA_POOL=512", expected_props: Some(ExpectedNvmeDeviceProperties { save_restore_supported: true, qsize: 256, // private pool should allow contiguous allocations. @@ -218,7 +218,7 @@ async fn nvme_relay_heuristic_debug_16vp_768mb_heavy( nvme_relay_test_core( config, NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + openhcl_cmdline: "", processor_topology: Some(ProcessorTopology::heavy()), vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate { size: Some(768 * 1024 * 1024), @@ -246,7 +246,7 @@ async fn nvme_relay_heuristic_release_16vp_256mb_heavy( nvme_relay_test_core( config, NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + openhcl_cmdline: "", processor_topology: Some(ProcessorTopology::heavy()), vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate { size: Some(256 * 1024 * 1024), @@ -277,7 +277,7 @@ async fn nvme_relay_heuristic_release_32vp_500mb_very_heavy( nvme_relay_test_core( config, NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + openhcl_cmdline: "", processor_topology: Some(ProcessorTopology::very_heavy()), vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate { size: Some(500 * 1024 * 1024), @@ -307,7 +307,7 @@ async fn nvme_relay_32vp_768mb_very_heavy( ..Default::default() }), NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0 OPENHCL_ENABLE_VTL2_GPA_POOL=10240", + openhcl_cmdline: "OPENHCL_ENABLE_VTL2_GPA_POOL=10240", processor_topology: Some(ProcessorTopology::very_heavy()), vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate { size: Some(768 * 1024 * 1024), From 4efda83fd3f175c3e606a49f635f61ff793d2c77 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 6 May 2026 17:45:36 -0700 Subject: [PATCH 16/17] For a local build --- openhcl/underhill_core/src/nvme_manager/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 9244f7ba50..f784e179a9 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -73,7 +73,7 @@ const KEEPALIVE_INCOMPATIBLE_VENDOR_ID: &str = "0x1414"; /// PCI device ID, as it appears in the sysfs `device` file (e.g. `0x0100`), /// for NVMe devices that are incompatible with keepalive. -const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0xb111"; +const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0x0000"; #[derive(Debug, Error)] pub enum NvmeSpawnerError { From 88aed69b618f45164f6bbd06a17254c4ff4cf0f4 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Wed, 6 May 2026 17:57:59 -0700 Subject: [PATCH 17/17] Revert "For a local build" This reverts commit 4efda83fd3f175c3e606a49f635f61ff793d2c77. --- openhcl/underhill_core/src/nvme_manager/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index f784e179a9..9244f7ba50 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -73,7 +73,7 @@ const KEEPALIVE_INCOMPATIBLE_VENDOR_ID: &str = "0x1414"; /// PCI device ID, as it appears in the sysfs `device` file (e.g. `0x0100`), /// for NVMe devices that are incompatible with keepalive. -const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0x0000"; +const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0xb111"; #[derive(Debug, Error)] pub enum NvmeSpawnerError {