From 6e91439544812856f1b64737045742968becfed7 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Thu, 30 Apr 2026 12:12:17 -0700 Subject: [PATCH 01/13] Squashes all changes from adding-admin-queue-logging all in to a single commit to fix build pipeline issues --- openhcl/openhcl_boot/src/cmdline.rs | 9 +-- openhcl/underhill_core/src/dispatch/mod.rs | 13 ++-- .../underhill_core/src/nvme_manager/device.rs | 1 + .../disk_nvme/nvme_driver/src/driver.rs | 70 ++++++++++--------- .../disk_nvme/nvme_driver/src/queue_pair.rs | 42 +++++++++-- .../tests/multiarch/openhcl_servicing.rs | 18 ++--- .../tests/tests/x86_64/openhcl_uefi.rs | 10 +-- 7 files changed, 96 insertions(+), 67 deletions(-) diff --git a/openhcl/openhcl_boot/src/cmdline.rs b/openhcl/openhcl_boot/src/cmdline.rs index a452b936e1..64239ecbce 100644 --- a/openhcl/openhcl_boot/src/cmdline.rs +++ b/openhcl/openhcl_boot/src/cmdline.rs @@ -132,7 +132,7 @@ impl BootCommandLineOptions { confidential_debug: false, enable_vtl2_gpa_pool: Vtl2GpaPoolConfig::Heuristics(Vtl2GpaPoolLookupTable::Release), // use the release config by default sidecar: SidecarOptions::default(), - disable_nvme_keep_alive: true, + disable_nvme_keep_alive: false, vtl2_gpa_pool_numa_split: false, } } @@ -141,9 +141,6 @@ impl BootCommandLineOptions { impl BootCommandLineOptions { /// Parse arguments from a command line. pub fn parse(&mut self, cmdline: &str) { - // Workaround for a host side issue: disable NVMe keepalive by default. - self.disable_nvme_keep_alive = true; - let mut override_vtl2_gpa_pool: Option = None; for arg in cmdline.split_whitespace() { if arg.starts_with(OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME) { @@ -186,8 +183,8 @@ impl BootCommandLineOptions { } } else if arg.starts_with(DISABLE_NVME_KEEP_ALIVE) { let arg = arg.split_once('=').map(|(_, arg)| arg); - if arg.is_some_and(|a| a == "0") { - self.disable_nvme_keep_alive = false; + if arg.is_some_and(|a| a != "0") { + self.disable_nvme_keep_alive = true; } } else if arg.starts_with(VTL2_GPA_POOL_NUMA) { if let Some((_, arg)) = arg.split_once('=') { diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 511639253f..0ac9610f83 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -444,16 +444,17 @@ impl LoadedVm { capabilities_flags, } = message; - // If the host provided timeout hint is >= uint16::max - // seconds, we treat that as a signal from the host that no + // If the host provided timeout hint is >= 59 seconds, + // we treat that as a signal from the host that no // timeout duration was set. We instead limit servicing to - // 200s in that case. - let timeout_hint = if timeout_hint >= Duration::from_secs(u16::MAX as u64) { + // 59s in that case. + let timeout_hint = if timeout_hint >= Duration::from_secs(59) { tracing::info!( CVM_ALLOWED, - "host provided UINT16_MAX timeout hint, defaulting to 200s" + host_timeout_hint_ms = timeout_hint.as_millis() as u64, + "host provided timeout hint > 59s, defaulting to 59s" ); - Duration::from_secs(200) + Duration::from_secs(59) } else { timeout_hint }; diff --git a/openhcl/underhill_core/src/nvme_manager/device.rs b/openhcl/underhill_core/src/nvme_manager/device.rs index 3fb4892297..bf85c41aea 100644 --- a/openhcl/underhill_core/src/nvme_manager/device.rs +++ b/openhcl/underhill_core/src/nvme_manager/device.rs @@ -557,6 +557,7 @@ impl NvmeDriverManagerWorker { } NvmeDriverRequest::Shutdown(rpc) => { rpc.handle(async |(_span, options)| { + tracing::info!(pci_id = %self.pci_id, do_not_reset = %options.do_not_reset, skip_device_shutdown = %options.skip_device_shutdown, "nvme device manager worker shutdown called"); // Driver may be `None` here if there was a failure during driver creation. // In that case, we just skip the shutdown rather than panic. match self.driver.take() { diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 25bb4472df..cc5c2e2916 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -831,6 +831,7 @@ impl NvmeDriver { tracing::info!( id = a.qid, pending_commands_count = a.handler_data.pending_cmds.commands.len(), + base_pfn = a.base_pfn, ?pci_id, "restoring admin queue", ); @@ -914,9 +915,10 @@ impl NvmeDriver { .io .iter() .map(|io_state| format!( - "{{qid={}, pending_commands_count={}}}", + "{{qid={}, pending_commands_count={}, base_pfn={}}}", io_state.queue_data.qid, - io_state.queue_data.handler_data.pending_cmds.commands.len() + io_state.queue_data.handler_data.pending_cmds.commands.len(), + io_state.queue_data.base_pfn, )) .collect::>() .join(", "), @@ -1270,7 +1272,7 @@ async fn handle_asynchronous_events( impl Drop for NvmeDriver { fn drop(&mut self) { - tracing::trace!(pci_id = ?self.device_id, ka = self.nvme_keepalive, task = self.task.is_some(), "dropping nvme driver"); + tracing::info!(pci_id = ?self.device_id, ka = self.nvme_keepalive, task = self.task.is_some(), "dropping nvme driver"); if self.task.is_some() { // Do not reset NVMe device when nvme_keepalive is requested. tracing::debug!(nvme_keepalive = self.nvme_keepalive, pci_id = ?self.device_id, "dropping nvme driver"); @@ -1308,32 +1310,29 @@ impl AsyncRun for DriverWorkerTask { stop: &mut task_control::StopTask<'_>, state: &mut WorkerState, ) -> Result<(), task_control::Cancelled> { - let r = stop - .until_stopped(async { - loop { - match self.recv.next().await { - Some(NvmeWorkerRequest::CreateIssuer(rpc)) => { - rpc.handle(async |cpu| self.create_io_issuer(state, cpu).await) - .await - } - Some(NvmeWorkerRequest::Save(rpc)) => { - rpc.handle(async |span| { - let child_span = tracing::info_span!( - parent: &span, - "nvme_worker_save", - pci_id = %self.device.id() - ); - self.save(state).instrument(child_span).await - }) - .await - } - None => break, - } + loop { + let cmd = stop.until_stopped(self.recv.next()).await?; + match cmd { + Some(NvmeWorkerRequest::CreateIssuer(rpc)) => { + rpc.handle(async |cpu| self.create_io_issuer(state, cpu).await) + .await } - }) - .await; - tracing::info!(pci_id = %self.device.id(), "nvme worker task exiting"); - r + Some(NvmeWorkerRequest::Save(rpc)) => { + rpc.handle(async |span| { + let child_span = tracing::info_span!( + parent: &span, + "nvme_worker_save", + pci_id = %self.device.id() + ); + self.save(state).instrument(child_span).await + }) + .await + } + None => break, + } + } + tracing::info!(pci_id = %self.device.id(), "nvme worker task exiting cleanly"); + Ok(()) } } @@ -1398,7 +1397,7 @@ impl DriverWorkerTask { } async fn create_io_issuer(&mut self, state: &mut WorkerState, cpu: u32) { - tracing::debug!(cpu, pci_id = ?self.device.id(), "issuer request"); + tracing::info!(cpu, pci_id = ?self.device.id(), "create io issuer request"); if self.io_issuers.per_cpu[cpu as usize].get().is_some() { return; } @@ -1496,7 +1495,7 @@ impl DriverWorkerTask { let iv = qid - 1; self.next_ioq_id += 1; - tracing::debug!(cpu, qid, iv, pci_id = ?self.device.id(), "creating io queue"); + tracing::info!(cpu, qid, iv, pci_id = ?self.device.id(), "create_nvme_io_queue_fn: creating io queue"); let interrupt = self .device @@ -1516,7 +1515,7 @@ impl DriverWorkerTask { qid, cpu, pci_id = ?self.device.id(), - "created io queue in SelfDrained state" + "create_nvme_io_queue_fn: created io queue in SelfDrained state" ); } @@ -1533,6 +1532,7 @@ impl DriverWorkerTask { drain_after_restore, ) .map_err(|err| DeviceError::IoQueuePairCreationFailure(err, qid))?; + tracing::info!(cpu, qid, pci_id = ?self.device.id(), "create_nvme_io_queue_fn: created the internal queue pair object."); assert_eq!(queue.sq_entries(), queue.cq_entries()); state.qsize = queue.sq_entries(); @@ -1599,6 +1599,7 @@ impl DriverWorkerTask { }; if let Err(err) = r.await { + tracing::info!(cpu, qid, pci_id = ?self.device.id(), created_completion_queue, error = ?err, "create_nvme_io_queue_fn: failed to create IO queue, tearing down and deleting internal resources."); if created_completion_queue { if let Err(err) = admin .issue_raw(spec::Command { @@ -1619,6 +1620,7 @@ impl DriverWorkerTask { return Err(DeviceError::Other(err)); } + tracing::info!(cpu, qid, pci_id = ?self.device.id(), "create_nvme_io_queue_fn: successfuly created io queue."); Ok(IoIssuer { issuer: io_queue.queue.issuer().clone(), cpu, @@ -1703,6 +1705,7 @@ impl DriverWorkerTask { pci_id = ?self.device.id(), id = admin_state.qid, pending_commands_count = admin_state.handler_data.pending_cmds.commands.len(), + pfn = admin_state.base_pfn, "saved admin queue", ); Some(admin_state) @@ -1758,9 +1761,10 @@ impl DriverWorkerTask { state = io .iter() .map(|io_state| format!( - "{{qid={}, pending_commands_count={}}}", + "{{qid={}, pending_commands_count={}, base_pfn={}}}", io_state.queue_data.qid, - io_state.queue_data.handler_data.pending_cmds.commands.len() + io_state.queue_data.handler_data.pending_cmds.commands.len(), + io_state.queue_data.base_pfn )) .collect::>() .join(", "), diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index 73fc432a53..5b6eb831b2 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -93,11 +93,12 @@ impl PendingCommands { const MAX_CIDS: usize = 1 << Self::CID_KEY_BITS; const CID_SEQ_OFFSET: Wrapping = Wrapping(1 << Self::CID_KEY_BITS); - fn new(qid: u16) -> Self { + fn new(qid: u16, device_id: String) -> Self { Self { commands: Slab::new(), next_cid_high_bits: Wrapping(0), qid, + device_id, } } @@ -124,7 +125,19 @@ impl PendingCommands { entry.insert(PendingCommand { command: *command, respond, - submitted_at: (self.qid == 0).then(Instant::now), + submitted_at: (self.qid == 0).then(|| { + tracing::info!( + pci_id = %self.device_id, + ?cid, + opcode = ?command.cdw0.opcode(), + opname = ?spec::AdminOpcode(command.cdw0.opcode()), + nsid = ?command.nsid, + cdw10 = command.cdw10, + cdw11 = command.cdw11, + "inserted admin command", + ); + Instant::now() + }), }); } @@ -140,6 +153,19 @@ impl PendingCommands { self.qid, command.command.cdw0.opcode(), ); + if let Some(submitted_at) = command.submitted_at { + tracing::info!( + pci_id = %self.device_id, + ?cid, + opcode = ?command.command.cdw0.opcode(), + opname = ?spec::AdminOpcode(command.command.cdw0.opcode()), + nsid = ?command.command.nsid, + cdw10 = command.command.cdw10, + cdw11 = command.command.cdw11, + elapsed = ?submitted_at.elapsed(), + "completed admin command", + ); + } command.respond } @@ -161,7 +187,11 @@ impl PendingCommands { } /// Restore pending commands from the saved state. - pub fn restore(saved_state: &PendingCommandsSavedState, qid: u16) -> anyhow::Result { + pub fn restore( + saved_state: &PendingCommandsSavedState, + qid: u16, + device_id: String, + ) -> anyhow::Result { let PendingCommandsSavedState { commands, next_cid_high_bits, @@ -188,6 +218,7 @@ impl PendingCommands { .collect::>(), next_cid_high_bits: Wrapping(*next_cid_high_bits), qid, + device_id, }) } } @@ -508,7 +539,7 @@ impl QueuePair { QueueHandler { sq: SubmissionQueue::new(qid, sq_entries, sq_mem_block), cq: CompletionQueue::new(qid, cq_entries, cq_mem_block), - commands: PendingCommands::new(qid), + commands: PendingCommands::new(qid, device_id.into()), stats: Default::default(), drain_after_restore, aer_handler, @@ -947,6 +978,7 @@ struct PendingCommands { #[inspect(hex)] next_cid_high_bits: Wrapping, qid: u16, + device_id: String, } #[derive(Inspect)] @@ -1360,7 +1392,7 @@ impl QueueHandler { Ok(Self { sq: SubmissionQueue::restore(sq_mem_block, sq_state)?, cq: CompletionQueue::restore(cq_mem_block, cq_state)?, - commands: PendingCommands::restore(pending_cmds, sq_state.sqid)?, + commands: PendingCommands::restore(pending_cmds, sq_state.sqid, device_id.into())?, stats: Default::default(), // Only drain pending commands for I/O queues. // Admin queue is expected to have pending Async Event requests. diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index e3c74aa96f..7d724b40b9 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -147,9 +147,7 @@ async fn servicing_keepalive_no_device( ) -> anyhow::Result<()> { let flags = config.default_servicing_flags(); openhcl_servicing_core( - config.with_openhcl_command_line( - "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", - ), + config.with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512"), igvm_file, flags, DEFAULT_SERVICING_COUNT, @@ -198,9 +196,7 @@ async fn servicing_keepalive_with_device( let flags = config.default_servicing_flags(); openhcl_servicing_core( config - .with_openhcl_command_line( - "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", - ) + .with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512") .with_boot_device_type(petri::BootDeviceType::ScsiViaNvme) .with_vmbus_redirect(true), // Need this to attach the NVMe device igvm_file, @@ -1219,9 +1215,7 @@ async fn create_keepalive_test_config_default( config .with_vmbus_redirect(true) - .with_openhcl_command_line( - "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", - ) + .with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512") .modify_backend(move |b| { b.with_custom_config(|c| { // Add a fault controller to test the nvme controller functionality @@ -1309,9 +1303,9 @@ async fn create_keepalive_test_config_custom( ) -> Result<(PetriVm, PipetteClient), anyhow::Error> { const NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); - let mut builder = config.with_vmbus_redirect(true).with_openhcl_command_line( - "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", - ); + let mut builder = config + .with_vmbus_redirect(true) + .with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512"); for cmdline in extra_cmdlines { builder = builder.with_openhcl_command_line(cmdline); diff --git a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs index b43f0ec855..593e78dcc6 100644 --- a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs +++ b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs @@ -193,7 +193,7 @@ async fn nvme_relay_explicit_private_pool( nvme_relay_test_core( config, NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + openhcl_cmdline: "OPENHCL_ENABLE_VTL2_GPA_POOL=512", expected_props: Some(ExpectedNvmeDeviceProperties { save_restore_supported: true, qsize: 256, // private pool should allow contiguous allocations. @@ -218,7 +218,7 @@ async fn nvme_relay_heuristic_debug_16vp_768mb_heavy( nvme_relay_test_core( config, NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + openhcl_cmdline: "", processor_topology: Some(ProcessorTopology::heavy()), vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate { size: Some(768 * 1024 * 1024), @@ -246,7 +246,7 @@ async fn nvme_relay_heuristic_release_16vp_256mb_heavy( nvme_relay_test_core( config, NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + openhcl_cmdline: "", processor_topology: Some(ProcessorTopology::heavy()), vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate { size: Some(256 * 1024 * 1024), @@ -277,7 +277,7 @@ async fn nvme_relay_heuristic_release_32vp_500mb_very_heavy( nvme_relay_test_core( config, NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + openhcl_cmdline: "", processor_topology: Some(ProcessorTopology::very_heavy()), vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate { size: Some(500 * 1024 * 1024), @@ -307,7 +307,7 @@ async fn nvme_relay_32vp_768mb_very_heavy( ..Default::default() }), NvmeRelayTestParams { - openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0 OPENHCL_ENABLE_VTL2_GPA_POOL=10240", + openhcl_cmdline: "OPENHCL_ENABLE_VTL2_GPA_POOL=10240", processor_topology: Some(ProcessorTopology::very_heavy()), vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate { size: Some(768 * 1024 * 1024), From 0a9f8715a1e5d5d4b0d840dd205ab6d74b696453 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Fri, 1 May 2026 14:46:36 -0700 Subject: [PATCH 02/13] Should enable keepalive but only for ASAP devices --- .../src/nvme_manager/manager.rs | 41 ++++++++++++++++--- .../underhill_core/src/nvme_manager/mod.rs | 17 ++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 1b59109fec..d901748574 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -5,6 +5,7 @@ use crate::nvme_manager::CreateNvmeDriver; use crate::nvme_manager::device::NvmeDriverManager; use crate::nvme_manager::device::NvmeDriverManagerClient; use crate::nvme_manager::device::NvmeDriverShutdownOptions; +use crate::nvme_manager::is_nvme_keepalive_compatible; use crate::nvme_manager::save_restore::NvmeManagerSavedState; use crate::nvme_manager::save_restore::NvmeSavedDiskConfig; use crate::servicing::NvmeSavedState; @@ -301,12 +302,27 @@ impl NvmeManagerWorker { async { join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| { + // nvme_keepalive is received from host but it is only valid + // when memory pool allocator supports save/restore. Further, + // as a partial mitigation for known incompatibilities between + // keepalive and NVMe Direct v2 devices, we only honor + // keepalive for ASAP devices (identified by the VPCI + // instance GUID containing `c05b`). + let host_requested_keepalive = + nvme_keepalive && self.context.save_restore_supported; + let device_keepalive = + host_requested_keepalive && is_nvme_keepalive_compatible(&pci_id); + if host_requested_keepalive && !device_keepalive { + tracing::info!( + %pci_id, + "disabling nvme keepalive for non-ASAP device; \ + falling back to reset-on-servicing" + ); + } driver .shutdown(NvmeDriverShutdownOptions { - // nvme_keepalive is received from host but it is only valid - // when memory pool allocator supports save/restore. - do_not_reset: nvme_keepalive && self.context.save_restore_supported, - skip_device_shutdown: nvme_keepalive && self.context.save_restore_supported, + do_not_reset: device_keepalive, + skip_device_shutdown: device_keepalive, }) .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id)) })) @@ -419,12 +435,27 @@ impl NvmeManagerWorker { /// Saves NVMe device's states into buffer during servicing. pub async fn save(&mut self) -> anyhow::Result { let mut nvme_disks: Vec = Vec::new(); + // Only save state for devices known to be keepalive-compatible. + // Non-ASAP (NVMe Direct v2) devices will be reset on servicing, so + // persisting their state would be a waste and could conflict with + // the reset path on restore. let mut devices_to_save: HashMap = self .context .devices .write() .iter() - .map(|(pci_id, driver)| (pci_id.clone(), driver.client().clone())) + .filter_map(|(pci_id, driver)| { + if is_nvme_keepalive_compatible(pci_id) { + Some((pci_id.clone(), driver.client().clone())) + } else { + tracing::info!( + %pci_id, + "skipping save of non-ASAP nvme device; \ + keepalive disabled for this device" + ); + None + } + }) .collect(); for (pci_id, client) in devices_to_save.iter_mut() { nvme_disks.push(NvmeSavedDiskConfig { diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index cc8977de45..9ac604f0a9 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -111,3 +111,20 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { saved_state: Option<&nvme_driver::save_restore::NvmeDriverSavedState>, ) -> Result, NvmeSpawnerError>; } + +/// Returns whether NVMe keepalive (servicing without device reset) is +/// known to be compatible with the device identified by this VPCI +/// instance ID. +/// +/// NVMe keepalive currently has known incompatibilities with NVMe Direct +/// v2 devices. As a partial mitigation, we only enable keepalive for +/// ASAP devices, which can be identified by the substring `c05b` in the +/// VPCI instance GUID. All other devices (NVMe Direct v2) fall back to +/// the legacy reset-on-servicing path even when the host-level keepalive +/// flag is enabled. +/// +/// This heuristic is intentionally fragile and is intended only as a +/// temporary mitigation while the real compatibility work is in flight. +pub fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { + pci_id.to_ascii_lowercase().starts_with("c05b") +} From aec5033ad44ed1f6e58b107833c038b0a1894569 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Fri, 1 May 2026 15:54:07 -0700 Subject: [PATCH 03/13] I think this is in better shape --- .../src/nvme_manager/manager.rs | 32 +++++-------------- .../underhill_core/src/nvme_manager/mod.rs | 15 ++------- 2 files changed, 10 insertions(+), 37 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index d901748574..5ef43eb555 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -302,27 +302,14 @@ impl NvmeManagerWorker { async { join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| { - // nvme_keepalive is received from host but it is only valid - // when memory pool allocator supports save/restore. Further, - // as a partial mitigation for known incompatibilities between - // keepalive and NVMe Direct v2 devices, we only honor - // keepalive for ASAP devices (identified by the VPCI - // instance GUID containing `c05b`). - let host_requested_keepalive = - nvme_keepalive && self.context.save_restore_supported; - let device_keepalive = - host_requested_keepalive && is_nvme_keepalive_compatible(&pci_id); - if host_requested_keepalive && !device_keepalive { - tracing::info!( - %pci_id, - "disabling nvme keepalive for non-ASAP device; \ - falling back to reset-on-servicing" - ); - } driver .shutdown(NvmeDriverShutdownOptions { - do_not_reset: device_keepalive, - skip_device_shutdown: device_keepalive, + do_not_reset: nvme_keepalive + && self.context.save_restore_supported + && is_nvme_keepalive_compatible(&pci_id), + skip_device_shutdown: nvme_keepalive + && self.context.save_restore_supported + && !is_nvme_keepalive_compatible(&pci_id), }) .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id)) })) @@ -375,7 +362,7 @@ impl NvmeManagerWorker { &context.driver_source, &pci_id, context.vp_count, - context.save_restore_supported, + context.save_restore_supported && is_nvme_keepalive_compatible(&pci_id), None, // No device yet, context.nvme_driver_spawner.clone(), )?; @@ -436,9 +423,6 @@ impl NvmeManagerWorker { pub async fn save(&mut self) -> anyhow::Result { let mut nvme_disks: Vec = Vec::new(); // Only save state for devices known to be keepalive-compatible. - // Non-ASAP (NVMe Direct v2) devices will be reset on servicing, so - // persisting their state would be a waste and could conflict with - // the reset path on restore. let mut devices_to_save: HashMap = self .context .devices @@ -450,7 +434,7 @@ impl NvmeManagerWorker { } else { tracing::info!( %pci_id, - "skipping save of non-ASAP nvme device; \ + "skipping save of nvme device; \ keepalive disabled for this device" ); None diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 9ac604f0a9..33b300993d 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -112,19 +112,8 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { ) -> Result, NvmeSpawnerError>; } -/// Returns whether NVMe keepalive (servicing without device reset) is -/// known to be compatible with the device identified by this VPCI -/// instance ID. -/// -/// NVMe keepalive currently has known incompatibilities with NVMe Direct -/// v2 devices. As a partial mitigation, we only enable keepalive for -/// ASAP devices, which can be identified by the substring `c05b` in the -/// VPCI instance GUID. All other devices (NVMe Direct v2) fall back to -/// the legacy reset-on-servicing path even when the host-level keepalive -/// flag is enabled. -/// -/// This heuristic is intentionally fragile and is intended only as a -/// temporary mitigation while the real compatibility work is in flight. +/// Returns whether the given PCI ID corresponds to an NVMe device that is +/// compatible with the keepalive. pub fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { pci_id.to_ascii_lowercase().starts_with("c05b") } From 1c8bc557cbc8f1e03b27f64c02c81d866e2bc3a8 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Fri, 1 May 2026 16:05:17 -0700 Subject: [PATCH 04/13] Don't start with keepalive either --- openhcl/underhill_core/src/nvme_manager/manager.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 5ef43eb555..7934dacb64 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -304,6 +304,8 @@ impl NvmeManagerWorker { join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| { driver .shutdown(NvmeDriverShutdownOptions { + // nvme_keepalive is received from host but it is only valid + // when memory pool allocator supports save/restore. do_not_reset: nvme_keepalive && self.context.save_restore_supported && is_nvme_keepalive_compatible(&pci_id), From 30b5bad61e623443e5c7733db402d10b049962d0 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Fri, 1 May 2026 16:07:08 -0700 Subject: [PATCH 05/13] Setting up the flags correctly --- openhcl/underhill_core/src/nvme_manager/manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 7934dacb64..7b04d9e3bd 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -311,7 +311,7 @@ impl NvmeManagerWorker { && is_nvme_keepalive_compatible(&pci_id), skip_device_shutdown: nvme_keepalive && self.context.save_restore_supported - && !is_nvme_keepalive_compatible(&pci_id), + && is_nvme_keepalive_compatible(&pci_id), }) .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id)) })) From 6076ef667d49ebbb9c75f34efa44e12b0dc55ce1 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 11:54:59 -0700 Subject: [PATCH 06/13] Added a test to verify that only ASAP devices are being reset --- .../tests/multiarch/openhcl_servicing.rs | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index e3c74aa96f..6072e0907c 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1169,6 +1169,191 @@ async fn servicing_keepalive_slow_create_io_queue( Ok(()) } +/// Verifies the per-device NVMe keepalive gate. +/// +/// Two NVMe controllers are attached to a single VM: +/// * An "ASAP" device whose VPCI instance ID has `data2 = 0xc05b`, which +/// makes its derived pci_id start with `c05b:` (matching the +/// keepalive-compatible heuristic). Keepalive must be honored for +/// this device — its controller is not reset across servicing, so +/// `CREATE_IO_COMPLETION_QUEUE` must NOT be issued after servicing. +/// The fault panics if the opcode is observed. +/// * An "NVMe Direct v2" device whose pci_id does not start with +/// `c05b:`. Keepalive must be downgraded to reset-on-servicing for +/// this device, so `CREATE_IO_COMPLETION_QUEUE` MUST be issued after +/// servicing. The fault verifies (via a oneshot) that the opcode is +/// observed. +/// +/// Keepalive is enabled VM-wide; the gate is expected to apply per +/// device based on its pci_id. +#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] +async fn servicing_keepalive_per_device_gate( + config: PetriVmBuilder, + (igvm_file,): (ResolvedArtifact,), +) -> Result<(), anyhow::Error> { + let mut flags = config.default_servicing_flags(); + flags.enable_nvme_keepalive = true; + + // VPCI instance IDs. The pci_id seen by the NVMe driver is derived + // as `format!("{:04x}:00:00.0", instance_id.data2)` (see + // `vpci_path` in `dispatch/vtl2_settings_worker.rs`), so `data2` + // controls the prefix of the pci_id. For ASAP we set data2 = 0xc05b + // so its pci_id is `c05b:00:00.0`. + const ASAP_NVME_INSTANCE: Guid = guid::guid!("00000000-c05b-0000-0000-000000000001"); + const ND2_NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); + + const ASAP_NSID: u32 = KEEPALIVE_VTL2_NSID; + const ND2_NSID: u32 = KEEPALIVE_VTL2_NSID + 1; + const ASAP_LUN: u32 = VTL0_NVME_LUN; + const ND2_LUN: u32 = VTL0_NVME_LUN + 1; + + // Two independent fault start cells — one per device. + let mut asap_fault_updater = CellUpdater::new(false); + let mut nd2_fault_updater = CellUpdater::new(false); + + let (nd2_create_seen_send, nd2_create_seen_recv) = mesh::oneshot::<()>(); + + // ASAP (c05b) device: keepalive must be honored — fail loudly if any + // CREATE_IO_COMPLETION_QUEUE is observed after the fault is armed. + let asap_fault_config = FaultConfiguration::new(asap_fault_updater.cell()) + .with_admin_queue_fault( + AdminQueueFaultConfig::new().with_submission_queue_fault( + CommandMatchBuilder::new() + .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) + .build(), + AdminQueueFaultBehavior::Panic( + "ASAP (c05b) device received CREATE_IO_COMPLETION_QUEUE after servicing — \ + keepalive should have been honored for this device but the controller \ + was reset." + .to_string(), + ), + ), + ); + + // NVMe Direct v2 (non-c05b) device: keepalive must be downgraded — + // verify CREATE_IO_COMPLETION_QUEUE IS issued after servicing. + let nd2_fault_config = FaultConfiguration::new(nd2_fault_updater.cell()) + .with_admin_queue_fault( + AdminQueueFaultConfig::new().with_submission_queue_fault( + CommandMatchBuilder::new() + .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) + .build(), + AdminQueueFaultBehavior::Verify(Some(nd2_create_seen_send)), + ), + ); + + let scsi_instance = Guid::new_random(); + + let (mut vm, agent) = config + .with_vmbus_redirect(true) + .with_openhcl_command_line( + "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0", + ) + .modify_backend(move |b| { + b.with_custom_config(move |c| { + c.vpci_devices.push(VpciDeviceConfig { + vtl: DeviceVtl::Vtl2, + instance_id: ASAP_NVME_INSTANCE, + resource: NvmeFaultControllerHandle { + subsystem_id: Guid::new_random(), + msix_count: 10, + max_io_queues: 10, + namespaces: vec![NamespaceDefinition { + nsid: ASAP_NSID, + read_only: false, + disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { + len: Some(DEFAULT_DISK_SIZE), + sector_size: None, + }) + .into_resource(), + }], + fault_config: asap_fault_config, + enable_tdisp_tests: false, + } + .into_resource(), + }); + c.vpci_devices.push(VpciDeviceConfig { + vtl: DeviceVtl::Vtl2, + instance_id: ND2_NVME_INSTANCE, + resource: NvmeFaultControllerHandle { + subsystem_id: Guid::new_random(), + msix_count: 10, + max_io_queues: 10, + namespaces: vec![NamespaceDefinition { + nsid: ND2_NSID, + read_only: false, + disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { + len: Some(DEFAULT_DISK_SIZE), + sector_size: None, + }) + .into_resource(), + }], + fault_config: nd2_fault_config, + enable_tdisp_tests: false, + } + .into_resource(), + }); + }) + }) + .add_vtl2_storage_controller( + Vtl2StorageControllerBuilder::new(ControllerType::Scsi) + .with_instance_id(scsi_instance) + .add_lun( + Vtl2LunBuilder::disk() + .with_location(ASAP_LUN) + .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( + ControllerType::Nvme, + ASAP_NVME_INSTANCE, + ASAP_NSID, + )), + ) + .add_lun( + Vtl2LunBuilder::disk() + .with_location(ND2_LUN) + .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( + ControllerType::Nvme, + ND2_NVME_INSTANCE, + ND2_NSID, + )), + ) + .build(), + ) + .run() + .await?; + + agent.ping().await?; + + // Arm both faults BEFORE servicing so that the post-servicing + // CREATE_IO_COMPLETION_QUEUE is what each fault matches. + asap_fault_updater.set(true).await; + nd2_fault_updater.set(true).await; + + vm.restart_openhcl(igvm_file.clone(), flags).await?; + + agent.ping().await?; + + // The non-c05b device must issue CREATE_IO_COMPLETION_QUEUE after + // servicing because its keepalive was downgraded to reset. + CancelContext::new() + .with_timeout(Duration::from_secs(60)) + .until_cancelled(nd2_create_seen_recv) + .await + .expect( + "non-c05b NVMe device did not issue CREATE_IO_COMPLETION_QUEUE within 60s after \ + servicing — the per-device keepalive gate did not downgrade keepalive for this \ + device.", + ) + .expect("CREATE_IO_COMPLETION_QUEUE verification on non-c05b device failed"); + + // If the c05b device had received CREATE_IO_COMPLETION_QUEUE, the + // panic fault would have crashed the VM and failed this test. We + // disarm the faults defensively before tearing down. + asap_fault_updater.set(false).await; + nd2_fault_updater.set(false).await; + + Ok(()) +} + async fn apply_fault_with_keepalive( config: PetriVmBuilder, fault_configuration: FaultConfiguration, From 1e19679ae91be3471fb92e09a1e0fba7a53b3f7c Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 12:01:57 -0700 Subject: [PATCH 07/13] Update comments --- .../vmm_tests/tests/tests/multiarch/openhcl_servicing.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 6072e0907c..2696f462b3 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1178,7 +1178,7 @@ async fn servicing_keepalive_slow_create_io_queue( /// this device — its controller is not reset across servicing, so /// `CREATE_IO_COMPLETION_QUEUE` must NOT be issued after servicing. /// The fault panics if the opcode is observed. -/// * An "NVMe Direct v2" device whose pci_id does not start with +/// * A device whose pci_id does not start with /// `c05b:`. Keepalive must be downgraded to reset-on-servicing for /// this device, so `CREATE_IO_COMPLETION_QUEUE` MUST be issued after /// servicing. The fault verifies (via a oneshot) that the opcode is @@ -1230,7 +1230,7 @@ async fn servicing_keepalive_per_device_gate( ), ); - // NVMe Direct v2 (non-c05b) device: keepalive must be downgraded — + // Non c05b device: keepalive must be downgraded — // verify CREATE_IO_COMPLETION_QUEUE IS issued after servicing. let nd2_fault_config = FaultConfiguration::new(nd2_fault_updater.cell()) .with_admin_queue_fault( From fc20c570ad5097794b540506d20400f35b9d7a8d Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 12:17:55 -0700 Subject: [PATCH 08/13] Added comment fix and removed magic numbers out of the function definition --- openhcl/underhill_core/src/nvme_manager/mod.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 33b300993d..b0adf280e0 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -67,6 +67,10 @@ pub struct NamespaceError { source: NvmeSpawnerError, } +/// PCI vendor-ID prefix for NVMe devices that support keepalive, including +/// the expected `:` separator in the `dddd:bb:dd.f` PCI ID format. +const KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX: &str = "c05b:"; + #[derive(Debug, Error)] pub enum NvmeSpawnerError { #[error("failed to initialize vfio device")] @@ -114,6 +118,9 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { /// Returns whether the given PCI ID corresponds to an NVMe device that is /// compatible with the keepalive. +/// DEV_NOTE: This is a heuristic based on the PCI vendor ID, which is not ideal but is necessary pub fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { - pci_id.to_ascii_lowercase().starts_with("c05b") + pci_id + .to_ascii_lowercase() + .starts_with(KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX) } From 4c15d5d586a1539036e95e77e7ea6d0faf0fcb43 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 12:32:33 -0700 Subject: [PATCH 09/13] More variable name cleanup --- .../underhill_core/src/nvme_manager/mod.rs | 2 +- .../tests/multiarch/openhcl_servicing.rs | 66 +++++++++---------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index b0adf280e0..d2540d7d76 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -119,7 +119,7 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { /// Returns whether the given PCI ID corresponds to an NVMe device that is /// compatible with the keepalive. /// DEV_NOTE: This is a heuristic based on the PCI vendor ID, which is not ideal but is necessary -pub fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { +pub(crate) fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { pci_id .to_ascii_lowercase() .starts_with(KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX) diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 2696f462b3..3c654d51dc 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -1172,7 +1172,7 @@ async fn servicing_keepalive_slow_create_io_queue( /// Verifies the per-device NVMe keepalive gate. /// /// Two NVMe controllers are attached to a single VM: -/// * An "ASAP" device whose VPCI instance ID has `data2 = 0xc05b`, which +/// * A "c05b" device whose VPCI instance ID has `data2 = 0xc05b`, which /// makes its derived pci_id start with `c05b:` (matching the /// keepalive-compatible heuristic). Keepalive must be honored for /// this device — its controller is not reset across servicing, so @@ -1197,32 +1197,32 @@ async fn servicing_keepalive_per_device_gate( // VPCI instance IDs. The pci_id seen by the NVMe driver is derived // as `format!("{:04x}:00:00.0", instance_id.data2)` (see // `vpci_path` in `dispatch/vtl2_settings_worker.rs`), so `data2` - // controls the prefix of the pci_id. For ASAP we set data2 = 0xc05b + // controls the prefix of the pci_id. For the device with keepalive we set data2 = 0xc05b // so its pci_id is `c05b:00:00.0`. - const ASAP_NVME_INSTANCE: Guid = guid::guid!("00000000-c05b-0000-0000-000000000001"); - const ND2_NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); + const WITH_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("00000000-c05b-0000-0000-000000000001"); + const NO_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3"); - const ASAP_NSID: u32 = KEEPALIVE_VTL2_NSID; - const ND2_NSID: u32 = KEEPALIVE_VTL2_NSID + 1; - const ASAP_LUN: u32 = VTL0_NVME_LUN; - const ND2_LUN: u32 = VTL0_NVME_LUN + 1; + const WITH_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID; + const NO_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID + 1; + const WITH_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN; + const NO_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN + 1; // Two independent fault start cells — one per device. - let mut asap_fault_updater = CellUpdater::new(false); - let mut nd2_fault_updater = CellUpdater::new(false); + let mut with_keepalive_fault_updater = CellUpdater::new(false); + let mut no_keepalive_fault_updater = CellUpdater::new(false); - let (nd2_create_seen_send, nd2_create_seen_recv) = mesh::oneshot::<()>(); + let (no_keepalive_create_seen_send, no_keepalive_create_seen_recv) = mesh::oneshot::<()>(); - // ASAP (c05b) device: keepalive must be honored — fail loudly if any + // c05b device: keepalive must be honored — fail loudly if any // CREATE_IO_COMPLETION_QUEUE is observed after the fault is armed. - let asap_fault_config = FaultConfiguration::new(asap_fault_updater.cell()) + let with_keepalive_fault_config = FaultConfiguration::new(with_keepalive_fault_updater.cell()) .with_admin_queue_fault( AdminQueueFaultConfig::new().with_submission_queue_fault( CommandMatchBuilder::new() .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) .build(), AdminQueueFaultBehavior::Panic( - "ASAP (c05b) device received CREATE_IO_COMPLETION_QUEUE after servicing — \ + "c05b device received CREATE_IO_COMPLETION_QUEUE after servicing — \ keepalive should have been honored for this device but the controller \ was reset." .to_string(), @@ -1232,13 +1232,13 @@ async fn servicing_keepalive_per_device_gate( // Non c05b device: keepalive must be downgraded — // verify CREATE_IO_COMPLETION_QUEUE IS issued after servicing. - let nd2_fault_config = FaultConfiguration::new(nd2_fault_updater.cell()) + let no_keepalive_fault_config = FaultConfiguration::new(no_keepalive_fault_updater.cell()) .with_admin_queue_fault( AdminQueueFaultConfig::new().with_submission_queue_fault( CommandMatchBuilder::new() .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0) .build(), - AdminQueueFaultBehavior::Verify(Some(nd2_create_seen_send)), + AdminQueueFaultBehavior::Verify(Some(no_keepalive_create_seen_send)), ), ); @@ -1253,13 +1253,13 @@ async fn servicing_keepalive_per_device_gate( b.with_custom_config(move |c| { c.vpci_devices.push(VpciDeviceConfig { vtl: DeviceVtl::Vtl2, - instance_id: ASAP_NVME_INSTANCE, + instance_id: WITH_KEEPALIVE_NVME_INSTANCE, resource: NvmeFaultControllerHandle { subsystem_id: Guid::new_random(), msix_count: 10, max_io_queues: 10, namespaces: vec![NamespaceDefinition { - nsid: ASAP_NSID, + nsid: WITH_KEEPALIVE_NSID, read_only: false, disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { len: Some(DEFAULT_DISK_SIZE), @@ -1267,20 +1267,20 @@ async fn servicing_keepalive_per_device_gate( }) .into_resource(), }], - fault_config: asap_fault_config, + fault_config: with_keepalive_fault_config, enable_tdisp_tests: false, } .into_resource(), }); c.vpci_devices.push(VpciDeviceConfig { vtl: DeviceVtl::Vtl2, - instance_id: ND2_NVME_INSTANCE, + instance_id: NO_KEEPALIVE_NVME_INSTANCE, resource: NvmeFaultControllerHandle { subsystem_id: Guid::new_random(), msix_count: 10, max_io_queues: 10, namespaces: vec![NamespaceDefinition { - nsid: ND2_NSID, + nsid: NO_KEEPALIVE_NSID, read_only: false, disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { len: Some(DEFAULT_DISK_SIZE), @@ -1288,7 +1288,7 @@ async fn servicing_keepalive_per_device_gate( }) .into_resource(), }], - fault_config: nd2_fault_config, + fault_config: no_keepalive_fault_config, enable_tdisp_tests: false, } .into_resource(), @@ -1300,20 +1300,20 @@ async fn servicing_keepalive_per_device_gate( .with_instance_id(scsi_instance) .add_lun( Vtl2LunBuilder::disk() - .with_location(ASAP_LUN) + .with_location(WITH_KEEPALIVE_LUN) .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( ControllerType::Nvme, - ASAP_NVME_INSTANCE, - ASAP_NSID, + WITH_KEEPALIVE_NVME_INSTANCE, + WITH_KEEPALIVE_NSID, )), ) .add_lun( Vtl2LunBuilder::disk() - .with_location(ND2_LUN) + .with_location(NO_KEEPALIVE_LUN) .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( ControllerType::Nvme, - ND2_NVME_INSTANCE, - ND2_NSID, + NO_KEEPALIVE_NVME_INSTANCE, + NO_KEEPALIVE_NSID, )), ) .build(), @@ -1325,8 +1325,8 @@ async fn servicing_keepalive_per_device_gate( // Arm both faults BEFORE servicing so that the post-servicing // CREATE_IO_COMPLETION_QUEUE is what each fault matches. - asap_fault_updater.set(true).await; - nd2_fault_updater.set(true).await; + with_keepalive_fault_updater.set(true).await; + no_keepalive_fault_updater.set(true).await; vm.restart_openhcl(igvm_file.clone(), flags).await?; @@ -1336,7 +1336,7 @@ async fn servicing_keepalive_per_device_gate( // servicing because its keepalive was downgraded to reset. CancelContext::new() .with_timeout(Duration::from_secs(60)) - .until_cancelled(nd2_create_seen_recv) + .until_cancelled(no_keepalive_create_seen_recv) .await .expect( "non-c05b NVMe device did not issue CREATE_IO_COMPLETION_QUEUE within 60s after \ @@ -1348,8 +1348,8 @@ async fn servicing_keepalive_per_device_gate( // If the c05b device had received CREATE_IO_COMPLETION_QUEUE, the // panic fault would have crashed the VM and failed this test. We // disarm the faults defensively before tearing down. - asap_fault_updater.set(false).await; - nd2_fault_updater.set(false).await; + with_keepalive_fault_updater.set(false).await; + no_keepalive_fault_updater.set(false).await; Ok(()) } From 4c9456c82e152ccfe46312caca93e659d3c30b3d Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 13:22:00 -0700 Subject: [PATCH 10/13] Fixed comments --- openhcl/underhill_core/src/nvme_manager/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index d2540d7d76..0d8b2a620c 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -67,9 +67,9 @@ pub struct NamespaceError { source: NvmeSpawnerError, } -/// PCI vendor-ID prefix for NVMe devices that support keepalive, including -/// the expected `:` separator in the `dddd:bb:dd.f` PCI ID format. -const KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX: &str = "c05b:"; +/// Device prefix for nvme devices that are compatibble with keepalive. +/// the "dddd:" in the `dddd:bb:dd.f` PCI ID format. +const KEEPALIVE_COMPATIBLE_DEVICE_PREFIX: &str = "c05b:"; #[derive(Debug, Error)] pub enum NvmeSpawnerError { @@ -118,9 +118,9 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { /// Returns whether the given PCI ID corresponds to an NVMe device that is /// compatible with the keepalive. -/// DEV_NOTE: This is a heuristic based on the PCI vendor ID, which is not ideal but is necessary +/// DEV_NOTE: This is a weak heuristic based approach which is not ideal but is necessary pub(crate) fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { pci_id .to_ascii_lowercase() - .starts_with(KEEPALIVE_COMPATIBLE_PCI_VENDOR_PREFIX) + .starts_with(KEEPALIVE_COMPATIBLE_DEVICE_PREFIX) } From dbab2149d7599cd394b3f18c11bdc64beecb83ab Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 13:30:06 -0700 Subject: [PATCH 11/13] Will no longer restore devices that are not keepalive comptible. Moreover, it will tear down that device state and start again ... at least it should --- openhcl/underhill_core/src/nvme_manager/manager.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 7b04d9e3bd..ee14680a94 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -465,6 +465,9 @@ impl NvmeManagerWorker { let mut restored_devices: HashMap = HashMap::new(); for disk in &saved_state.nvme_disks { + // Only restore disks that are known to be keepalive compatible. + // This should gracefully tear down state for non-keepalive devices + // before using the device. let pci_id = disk.pci_id.clone(); let nvme_driver = self .context @@ -473,7 +476,7 @@ impl NvmeManagerWorker { &self.context.driver_source, &pci_id, saved_state.cpu_count, - save_restore_supported, + save_restore_supported && is_nvme_keepalive_compatible(&pci_id), Some(&disk.driver_state), ) .await?; From 6412280ee8d66b6b19c3dc53ef346f7ab73adf21 Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 16:55:20 -0700 Subject: [PATCH 12/13] Now using vendor and device ID information instead of using pci ID to figure out keepalive info --- .../underhill_core/src/nvme_manager/device.rs | 10 +++ .../src/nvme_manager/manager.rs | 37 +++++++--- .../underhill_core/src/nvme_manager/mod.rs | 55 ++++++++++++--- .../storage/nvme_resources/src/fault.rs | 68 +++++++++++++++++++ vm/devices/storage/nvme_test/src/pci.rs | 15 +++- .../tests/multiarch/openhcl_servicing.rs | 30 ++++---- 6 files changed, 184 insertions(+), 31 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/device.rs b/openhcl/underhill_core/src/nvme_manager/device.rs index 3fb4892297..e9bda747b1 100644 --- a/openhcl/underhill_core/src/nvme_manager/device.rs +++ b/openhcl/underhill_core/src/nvme_manager/device.rs @@ -334,6 +334,9 @@ enum NvmeDriverRequest { pub struct NvmeDriverManager { task: Task<()>, pci_id: String, + /// Whether the underlying device is compatible with NVMe keepalive + /// (cached at construction time so callers don't have to re-read sysfs). + keepalive_compatible: bool, client: NvmeDriverManagerClient, } @@ -358,12 +361,18 @@ impl NvmeDriverManager { &self.client } + /// Returns whether the underlying device is compatible with NVMe keepalive. + pub fn keepalive_compatible(&self) -> bool { + self.keepalive_compatible + } + /// Creates the [`NvmeDriverManager`]. pub fn new( driver_source: &VmTaskDriverSource, pci_id: &str, vp_count: u32, save_restore_supported: bool, + keepalive_compatible: bool, device: Option>, nvme_driver_spawner: Arc, ) -> anyhow::Result { @@ -382,6 +391,7 @@ impl NvmeDriverManager { Ok(Self { task, pci_id: pci_id.into(), + keepalive_compatible, client: NvmeDriverManagerClient { pci_id: pci_id.into(), sender: send, diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index ee14680a94..13f86430a7 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -302,16 +302,17 @@ impl NvmeManagerWorker { async { join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| { + let keepalive_compatible = driver.keepalive_compatible(); driver .shutdown(NvmeDriverShutdownOptions { // nvme_keepalive is received from host but it is only valid // when memory pool allocator supports save/restore. do_not_reset: nvme_keepalive && self.context.save_restore_supported - && is_nvme_keepalive_compatible(&pci_id), + && keepalive_compatible, skip_device_shutdown: nvme_keepalive && self.context.save_restore_supported - && is_nvme_keepalive_compatible(&pci_id), + && keepalive_compatible, }) .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id)) })) @@ -360,11 +361,17 @@ impl NvmeManagerWorker { match guard.entry(pci_id.clone()) { hash_map::Entry::Occupied(_) => unreachable!(), // We checked above that this entry does not exist. hash_map::Entry::Vacant(entry) => { + // Read the PCI vendor/device IDs once and cache the + // resulting keepalive compatibility flag on the + // `NvmeDriverManager` for later save/shutdown queries. + let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id); + let driver = NvmeDriverManager::new( &context.driver_source, &pci_id, context.vp_count, - context.save_restore_supported && is_nvme_keepalive_compatible(&pci_id), + context.save_restore_supported && keepalive_compatible, + keepalive_compatible, None, // No device yet, context.nvme_driver_spawner.clone(), )?; @@ -431,7 +438,7 @@ impl NvmeManagerWorker { .write() .iter() .filter_map(|(pci_id, driver)| { - if is_nvme_keepalive_compatible(pci_id) { + if driver.keepalive_compatible() { Some((pci_id.clone(), driver.client().clone())) } else { tracing::info!( @@ -469,6 +476,11 @@ impl NvmeManagerWorker { // This should gracefully tear down state for non-keepalive devices // before using the device. let pci_id = disk.pci_id.clone(); + + // Read the PCI vendor/device IDs once and cache the resulting + // keepalive compatibility flag on the `NvmeDriverManager`. + let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id); + let nvme_driver = self .context .nvme_driver_spawner @@ -476,7 +488,7 @@ impl NvmeManagerWorker { &self.context.driver_source, &pci_id, saved_state.cpu_count, - save_restore_supported && is_nvme_keepalive_compatible(&pci_id), + save_restore_supported && keepalive_compatible, Some(&disk.driver_state), ) .await?; @@ -488,6 +500,7 @@ impl NvmeManagerWorker { &pci_id, self.context.vp_count, true, // save_restore_supported is always `true` when restoring. + keepalive_compatible, Some(nvme_driver), self.context.nvme_driver_spawner.clone(), )?, @@ -1088,9 +1101,16 @@ mod tests { )); // Create a driver manager - let driver_manager = - NvmeDriverManager::new(&driver_source, "0000:00:04.0", 4, false, None, spawner) - .unwrap(); + let driver_manager = NvmeDriverManager::new( + &driver_source, + "0000:00:04.0", + 4, + false, + false, + None, + spawner, + ) + .unwrap(); let client = driver_manager.client().clone(); @@ -1174,6 +1194,7 @@ mod tests { "0000:00:05.0", 4, true, // save_restore_supported + true, None, spawner, ) diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 0d8b2a620c..09bd259ad2 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -67,9 +67,13 @@ pub struct NamespaceError { source: NvmeSpawnerError, } -/// Device prefix for nvme devices that are compatibble with keepalive. -/// the "dddd:" in the `dddd:bb:dd.f` PCI ID format. -const KEEPALIVE_COMPATIBLE_DEVICE_PREFIX: &str = "c05b:"; +/// PCI vendor ID, as it appears in the sysfs `vendor` file (e.g. `0x0100`), +/// for NVMe devices that are compatible with keepalive. +const KEEPALIVE_INCOMPATIBLE_VENDOR_ID: &str = "0x1414"; + +/// PCI device ID, as it appears in the sysfs `device` file (e.g. `0x0100`), +/// for NVMe devices that are compatible with keepalive. +const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0xb111"; #[derive(Debug, Error)] pub enum NvmeSpawnerError { @@ -116,11 +120,44 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync { ) -> Result, NvmeSpawnerError>; } -/// Returns whether the given PCI ID corresponds to an NVMe device that is -/// compatible with the keepalive. -/// DEV_NOTE: This is a weak heuristic based approach which is not ideal but is necessary +/// Returns whether the given PCI device is compatible with NVMe keepalive. pub(crate) fn is_nvme_keepalive_compatible(pci_id: &str) -> bool { - pci_id - .to_ascii_lowercase() - .starts_with(KEEPALIVE_COMPATIBLE_DEVICE_PREFIX) + match read_pci_vendor_device_ids(&pci_id) { + Ok((vendor_id, device_id)) => { + vendor_id != KEEPALIVE_INCOMPATIBLE_VENDOR_ID + || device_id != KEEPALIVE_INCOMPATIBLE_DEVICE_ID + } + Err(err) => { + tracing::warn!( + pci_id = %pci_id, + error = err.as_ref() as &dyn std::error::Error, + "failed to read PCI vendor/device IDs; treating device as not keepalive-compatible" + ); + false + } + } +} + +/// Reads the sysfs `vendor` and `device` files for the given PCI device, +/// returning the trimmed contents (e.g. `"0x0100"`). +/// +/// Callers should invoke this once per device and cache the result, since +/// the values do not change for the lifetime of the device. +fn read_pci_vendor_device_ids(pci_id: &str) -> anyhow::Result<(String, String)> { + let devpath = std::path::Path::new("/sys/bus/pci/devices").join(pci_id); + let vendor = fs_err::read_to_string(devpath.join("vendor"))? + .trim_end() + .to_owned(); + let device = fs_err::read_to_string(devpath.join("device"))? + .trim_end() + .to_owned(); + + tracing::info!( + pci_id = %pci_id, + vendor = %vendor, + device = %device, + "read PCI vendor/device IDs" + ); + + Ok((vendor, device)) } diff --git a/vm/devices/storage/nvme_resources/src/fault.rs b/vm/devices/storage/nvme_resources/src/fault.rs index 8fcad86766..8ed05989b6 100644 --- a/vm/devices/storage/nvme_resources/src/fault.rs +++ b/vm/devices/storage/nvme_resources/src/fault.rs @@ -315,6 +315,42 @@ pub struct CommandMatch { pub mask: [u8; 64], } +/// A fault configuration for overriding the device's hardware configuration +/// (i.e. the values reported in PCI configuration space). +/// +/// When a field is `Some`, its value replaces the corresponding field reported +/// by the NVMe fault controller's PCI hardware IDs. When `None`, the real +/// hardware ID value is used unchanged. Unlike most other faults in this +/// module, hardware config faults are applied at device construction time +/// and are therefore not gated by the `fault_active` flag. +/// +/// # Example +/// Override the Vendor ID and Device ID reported in PCI config space. +/// ```no_run +/// use mesh::CellUpdater; +/// use nvme_resources::fault::FaultConfiguration; +/// use nvme_resources::fault::HardwareConfigFaultConfig; +/// +/// pub fn hardware_config_fault() -> FaultConfiguration { +/// let mut fault_start_updater = CellUpdater::new(false); +/// FaultConfiguration::new(fault_start_updater.cell()) +/// .with_hardware_config_fault( +/// HardwareConfigFaultConfig::new() +/// .with_vendor_id(0x1414) +/// .with_device_id(0x00a9), +/// ) +/// } +/// ``` +#[derive(Debug, Copy, Clone, Default, MeshPayload)] +pub struct HardwareConfigFaultConfig { + /// Override for the device's PCI Vendor ID. When `None`, the real Vendor + /// ID is used. + pub vendor_id: Option, + /// Override for the device's PCI Device ID. When `None`, the real Device + /// ID is used. + pub device_id: Option, +} + /// Fault configuration for the NVMe fault controller. /// /// This struct defines behaviors that inject faults into the NVMe fault controller logic, @@ -390,6 +426,8 @@ pub struct FaultConfiguration { pub namespace_fault: NamespaceFaultConfig, /// Fault to apply to all IO queues pub io_fault: Arc, + /// Fault to apply to the Hardware Configuration + pub hardware_config_fault: Option, } impl FaultConfiguration { @@ -404,6 +442,7 @@ impl FaultConfiguration { pci_fault: Some(PciFaultConfig::new()), namespace_fault: NamespaceFaultConfig::new(mesh::channel().1), io_fault: Arc::new(IoQueueFaultConfig::new(fault_active)), + hardware_config_fault: None, } } @@ -430,6 +469,35 @@ impl FaultConfiguration { self.namespace_fault = namespace_fault; self } + + /// Add a hardware config fault configuration to the fault configuration + pub fn with_hardware_config_fault( + mut self, + hardware_config_fault: HardwareConfigFaultConfig, + ) -> Self { + self.hardware_config_fault = Some(hardware_config_fault); + self + } +} + +impl HardwareConfigFaultConfig { + /// Create a new no-op hardware config fault configuration. All fields + /// default to `None`, meaning the real hardware values are used. + pub fn new() -> Self { + Self::default() + } + + /// Override the PCI Vendor ID reported by the device. + pub fn with_vendor_id(mut self, vendor_id: u16) -> Self { + self.vendor_id = Some(vendor_id); + self + } + + /// Override the PCI Device ID reported by the device. + pub fn with_device_id(mut self, device_id: u16) -> Self { + self.device_id = Some(device_id); + self + } } impl PciFaultConfig { diff --git a/vm/devices/storage/nvme_test/src/pci.rs b/vm/devices/storage/nvme_test/src/pci.rs index 278793d3e8..396951f19b 100644 --- a/vm/devices/storage/nvme_test/src/pci.rs +++ b/vm/devices/storage/nvme_test/src/pci.rs @@ -137,10 +137,21 @@ impl NvmeFaultController { BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())), ); + // Apply any hardware-config fault overrides for the IDs reported in + // PCI configuration space, falling back to the real values when no + // override is configured. + let hardware_config_fault = fault_configuration.hardware_config_fault.take(); + let vendor_id = hardware_config_fault + .and_then(|f| f.vendor_id) + .unwrap_or(VENDOR_ID); + let device_id = hardware_config_fault + .and_then(|f| f.device_id) + .unwrap_or(0x00a9); + let cfg_space = ConfigSpaceType0Emulator::new( HardwareIds { - vendor_id: VENDOR_ID, - device_id: 0x00a9, + vendor_id, + device_id, revision_id: 0, prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME, sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY, diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index 3c654d51dc..c1055f3cc4 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -19,6 +19,7 @@ use nvme_resources::NvmeFaultControllerHandle; use nvme_resources::fault::AdminQueueFaultBehavior; use nvme_resources::fault::AdminQueueFaultConfig; use nvme_resources::fault::FaultConfiguration; +use nvme_resources::fault::HardwareConfigFaultConfig; use nvme_resources::fault::IoQueueFaultBehavior; use nvme_resources::fault::IoQueueFaultConfig; use nvme_resources::fault::NamespaceChange; @@ -1240,6 +1241,11 @@ async fn servicing_keepalive_per_device_gate( .build(), AdminQueueFaultBehavior::Verify(Some(no_keepalive_create_seen_send)), ), + ) + .with_hardware_config_fault( + HardwareConfigFaultConfig::new() + .with_vendor_id(0x1414) + .with_device_id(0xb111), ); let scsi_instance = Guid::new_random(); @@ -1253,13 +1259,13 @@ async fn servicing_keepalive_per_device_gate( b.with_custom_config(move |c| { c.vpci_devices.push(VpciDeviceConfig { vtl: DeviceVtl::Vtl2, - instance_id: WITH_KEEPALIVE_NVME_INSTANCE, + instance_id: NO_KEEPALIVE_NVME_INSTANCE, resource: NvmeFaultControllerHandle { subsystem_id: Guid::new_random(), msix_count: 10, max_io_queues: 10, namespaces: vec![NamespaceDefinition { - nsid: WITH_KEEPALIVE_NSID, + nsid: NO_KEEPALIVE_NSID, read_only: false, disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { len: Some(DEFAULT_DISK_SIZE), @@ -1267,20 +1273,20 @@ async fn servicing_keepalive_per_device_gate( }) .into_resource(), }], - fault_config: with_keepalive_fault_config, + fault_config: no_keepalive_fault_config, enable_tdisp_tests: false, } .into_resource(), }); c.vpci_devices.push(VpciDeviceConfig { vtl: DeviceVtl::Vtl2, - instance_id: NO_KEEPALIVE_NVME_INSTANCE, + instance_id: WITH_KEEPALIVE_NVME_INSTANCE, resource: NvmeFaultControllerHandle { subsystem_id: Guid::new_random(), msix_count: 10, max_io_queues: 10, namespaces: vec![NamespaceDefinition { - nsid: NO_KEEPALIVE_NSID, + nsid: WITH_KEEPALIVE_NSID, read_only: false, disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle { len: Some(DEFAULT_DISK_SIZE), @@ -1288,7 +1294,7 @@ async fn servicing_keepalive_per_device_gate( }) .into_resource(), }], - fault_config: no_keepalive_fault_config, + fault_config: with_keepalive_fault_config, enable_tdisp_tests: false, } .into_resource(), @@ -1300,20 +1306,20 @@ async fn servicing_keepalive_per_device_gate( .with_instance_id(scsi_instance) .add_lun( Vtl2LunBuilder::disk() - .with_location(WITH_KEEPALIVE_LUN) + .with_location(NO_KEEPALIVE_LUN) .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( ControllerType::Nvme, - WITH_KEEPALIVE_NVME_INSTANCE, - WITH_KEEPALIVE_NSID, + NO_KEEPALIVE_NVME_INSTANCE, + NO_KEEPALIVE_NSID, )), ) .add_lun( Vtl2LunBuilder::disk() - .with_location(NO_KEEPALIVE_LUN) + .with_location(WITH_KEEPALIVE_LUN) .with_physical_device(Vtl2StorageBackingDeviceBuilder::new( ControllerType::Nvme, - NO_KEEPALIVE_NVME_INSTANCE, - NO_KEEPALIVE_NSID, + WITH_KEEPALIVE_NVME_INSTANCE, + WITH_KEEPALIVE_NSID, )), ) .build(), From dea6e9e93ff1320c793f5f90ddcd31f49944b00d Mon Sep 17 00:00:00 2001 From: Guramrit Singh Date: Mon, 4 May 2026 17:02:50 -0700 Subject: [PATCH 13/13] Minor updates to comments and such --- openhcl/underhill_core/src/nvme_manager/device.rs | 2 -- openhcl/underhill_core/src/nvme_manager/manager.rs | 6 ------ openhcl/underhill_core/src/nvme_manager/mod.rs | 4 ++-- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager/device.rs b/openhcl/underhill_core/src/nvme_manager/device.rs index e9bda747b1..cf57f37a75 100644 --- a/openhcl/underhill_core/src/nvme_manager/device.rs +++ b/openhcl/underhill_core/src/nvme_manager/device.rs @@ -334,8 +334,6 @@ enum NvmeDriverRequest { pub struct NvmeDriverManager { task: Task<()>, pci_id: String, - /// Whether the underlying device is compatible with NVMe keepalive - /// (cached at construction time so callers don't have to re-read sysfs). keepalive_compatible: bool, client: NvmeDriverManagerClient, } diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs index 13f86430a7..910a113f12 100644 --- a/openhcl/underhill_core/src/nvme_manager/manager.rs +++ b/openhcl/underhill_core/src/nvme_manager/manager.rs @@ -361,9 +361,6 @@ impl NvmeManagerWorker { match guard.entry(pci_id.clone()) { hash_map::Entry::Occupied(_) => unreachable!(), // We checked above that this entry does not exist. hash_map::Entry::Vacant(entry) => { - // Read the PCI vendor/device IDs once and cache the - // resulting keepalive compatibility flag on the - // `NvmeDriverManager` for later save/shutdown queries. let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id); let driver = NvmeDriverManager::new( @@ -472,9 +469,6 @@ impl NvmeManagerWorker { let mut restored_devices: HashMap = HashMap::new(); for disk in &saved_state.nvme_disks { - // Only restore disks that are known to be keepalive compatible. - // This should gracefully tear down state for non-keepalive devices - // before using the device. let pci_id = disk.pci_id.clone(); // Read the PCI vendor/device IDs once and cache the resulting diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs index 09bd259ad2..9244f7ba50 100644 --- a/openhcl/underhill_core/src/nvme_manager/mod.rs +++ b/openhcl/underhill_core/src/nvme_manager/mod.rs @@ -68,11 +68,11 @@ pub struct NamespaceError { } /// PCI vendor ID, as it appears in the sysfs `vendor` file (e.g. `0x0100`), -/// for NVMe devices that are compatible with keepalive. +/// for NVMe devices that are incompatible with keepalive. const KEEPALIVE_INCOMPATIBLE_VENDOR_ID: &str = "0x1414"; /// PCI device ID, as it appears in the sysfs `device` file (e.g. `0x0100`), -/// for NVMe devices that are compatible with keepalive. +/// for NVMe devices that are incompatible with keepalive. const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0xb111"; #[derive(Debug, Error)]