diff --git a/openhcl/openhcl_boot/src/cmdline.rs b/openhcl/openhcl_boot/src/cmdline.rs
index a452b936e1..64239ecbce 100644
--- a/openhcl/openhcl_boot/src/cmdline.rs
+++ b/openhcl/openhcl_boot/src/cmdline.rs
@@ -132,7 +132,7 @@ impl BootCommandLineOptions {
             confidential_debug: false,
             enable_vtl2_gpa_pool: Vtl2GpaPoolConfig::Heuristics(Vtl2GpaPoolLookupTable::Release), // use the release config by default
             sidecar: SidecarOptions::default(),
-            disable_nvme_keep_alive: true,
+            disable_nvme_keep_alive: false,
             vtl2_gpa_pool_numa_split: false,
         }
     }
@@ -141,9 +141,6 @@ impl BootCommandLineOptions {
     /// Parse arguments from a command line.
     pub fn parse(&mut self, cmdline: &str) {
-        // Workaround for a host side issue: disable NVMe keepalive by default.
-        self.disable_nvme_keep_alive = true;
-
         let mut override_vtl2_gpa_pool: Option = None;
         for arg in cmdline.split_whitespace() {
             if arg.starts_with(OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME) {
@@ -186,8 +183,8 @@ impl BootCommandLineOptions {
                 }
             } else if arg.starts_with(DISABLE_NVME_KEEP_ALIVE) {
                 let arg = arg.split_once('=').map(|(_, arg)| arg);
-                if arg.is_some_and(|a| a == "0") {
-                    self.disable_nvme_keep_alive = false;
+                if arg.is_some_and(|a| a != "0") {
+                    self.disable_nvme_keep_alive = true;
                 }
             } else if arg.starts_with(VTL2_GPA_POOL_NUMA) {
                 if let Some((_, arg)) = arg.split_once('=') {
diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs
index 511639253f..0ac9610f83 100644
--- a/openhcl/underhill_core/src/dispatch/mod.rs
+++ b/openhcl/underhill_core/src/dispatch/mod.rs
@@ -444,16 +444,17 @@ impl LoadedVm {
             capabilities_flags,
         } = message;
 
-        // If the host provided timeout hint is >= uint16::max
-        // seconds, we treat that as a signal from the host that no
+        // If the host provided timeout hint is >= 59 seconds,
+        // we treat that as a signal from the host that no
         // timeout duration was set. We instead limit servicing to
-        // 200s in that case.
-        let timeout_hint = if timeout_hint >= Duration::from_secs(u16::MAX as u64) {
+        // 59s in that case.
+        let timeout_hint = if timeout_hint >= Duration::from_secs(59) {
             tracing::info!(
                 CVM_ALLOWED,
-                "host provided UINT16_MAX timeout hint, defaulting to 200s"
+                host_timeout_hint_ms = timeout_hint.as_millis() as u64,
+                "host provided timeout hint >= 59s, defaulting to 59s"
             );
-            Duration::from_secs(200)
+            Duration::from_secs(59)
         } else {
             timeout_hint
         };
diff --git a/openhcl/underhill_core/src/nvme_manager/device.rs b/openhcl/underhill_core/src/nvme_manager/device.rs
index 3fb4892297..fd80795268 100644
--- a/openhcl/underhill_core/src/nvme_manager/device.rs
+++ b/openhcl/underhill_core/src/nvme_manager/device.rs
@@ -334,6 +334,7 @@ enum NvmeDriverRequest {
 pub struct NvmeDriverManager {
     task: Task<()>,
     pci_id: String,
+    keepalive_compatible: bool,
     client: NvmeDriverManagerClient,
 }
 
@@ -358,12 +359,18 @@ impl NvmeDriverManager {
         &self.client
     }
 
+    /// Returns whether the underlying device is compatible with NVMe keepalive.
+    pub fn keepalive_compatible(&self) -> bool {
+        self.keepalive_compatible
+    }
+
     /// Creates the [`NvmeDriverManager`].
    pub fn new(
        driver_source: &VmTaskDriverSource,
        pci_id: &str,
        vp_count: u32,
        save_restore_supported: bool,
+        keepalive_compatible: bool,
        device: Option>,
        nvme_driver_spawner: Arc<dyn CreateNvmeDriver>,
    ) -> anyhow::Result<Self> {
@@ -382,6 +389,7 @@ impl NvmeDriverManager {
         Ok(Self {
             task,
             pci_id: pci_id.into(),
+            keepalive_compatible,
             client: NvmeDriverManagerClient {
                 pci_id: pci_id.into(),
                 sender: send,
@@ -557,6 +565,7 @@ impl NvmeDriverManagerWorker {
             }
             NvmeDriverRequest::Shutdown(rpc) => {
                 rpc.handle(async |(_span, options)| {
+                    tracing::info!(pci_id = %self.pci_id, do_not_reset = %options.do_not_reset, skip_device_shutdown = %options.skip_device_shutdown, "nvme device manager worker shutdown called");
                     // Driver may be `None` here if there was a failure during driver creation.
                     // In that case, we just skip the shutdown rather than panic.
                     match self.driver.take() {
diff --git a/openhcl/underhill_core/src/nvme_manager/manager.rs b/openhcl/underhill_core/src/nvme_manager/manager.rs
index 1b59109fec..910a113f12 100644
--- a/openhcl/underhill_core/src/nvme_manager/manager.rs
+++ b/openhcl/underhill_core/src/nvme_manager/manager.rs
@@ -5,6 +5,7 @@ use crate::nvme_manager::CreateNvmeDriver;
 use crate::nvme_manager::device::NvmeDriverManager;
 use crate::nvme_manager::device::NvmeDriverManagerClient;
 use crate::nvme_manager::device::NvmeDriverShutdownOptions;
+use crate::nvme_manager::is_nvme_keepalive_compatible;
 use crate::nvme_manager::save_restore::NvmeManagerSavedState;
 use crate::nvme_manager::save_restore::NvmeSavedDiskConfig;
 use crate::servicing::NvmeSavedState;
@@ -301,12 +302,17 @@ impl NvmeManagerWorker {
             async {
                 join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| {
+                    let keepalive_compatible = driver.keepalive_compatible();
                     driver
                         .shutdown(NvmeDriverShutdownOptions {
                             // nvme_keepalive is received from host but it is only valid
                             // when memory pool allocator supports save/restore.
-                            do_not_reset: nvme_keepalive && self.context.save_restore_supported,
-                            skip_device_shutdown: nvme_keepalive && self.context.save_restore_supported,
+                            do_not_reset: nvme_keepalive
+                                && self.context.save_restore_supported
+                                && keepalive_compatible,
+                            skip_device_shutdown: nvme_keepalive
+                                && self.context.save_restore_supported
+                                && keepalive_compatible,
                         })
                         .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id))
                 }))
@@ -355,11 +361,14 @@ impl NvmeManagerWorker {
         match guard.entry(pci_id.clone()) {
             hash_map::Entry::Occupied(_) => unreachable!(), // We checked above that this entry does not exist.
             hash_map::Entry::Vacant(entry) => {
+                let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id);
+
                 let driver = NvmeDriverManager::new(
                     &context.driver_source,
                     &pci_id,
                     context.vp_count,
-                    context.save_restore_supported,
+                    context.save_restore_supported && keepalive_compatible,
+                    keepalive_compatible,
                     None, // No device yet,
                     context.nvme_driver_spawner.clone(),
                 )?;
@@ -419,12 +428,24 @@ impl NvmeManagerWorker {
     /// Saves NVMe device's states into buffer during servicing.
     pub async fn save(&mut self) -> anyhow::Result<NvmeManagerSavedState> {
         let mut nvme_disks: Vec<NvmeSavedDiskConfig> = Vec::new();
+        // Only save state for devices known to be keepalive-compatible.
        let mut devices_to_save: HashMap<String, NvmeDriverManagerClient> = self
            .context
            .devices
            .write()
            .iter()
-            .map(|(pci_id, driver)| (pci_id.clone(), driver.client().clone()))
+            .filter_map(|(pci_id, driver)| {
+                if driver.keepalive_compatible() {
+                    Some((pci_id.clone(), driver.client().clone()))
+                } else {
+                    tracing::info!(
+                        %pci_id,
+                        "skipping save of nvme device; \
+                        keepalive disabled for this device"
+                    );
+                    None
+                }
+            })
            .collect();
        for (pci_id, client) in devices_to_save.iter_mut() {
            nvme_disks.push(NvmeSavedDiskConfig {
@@ -449,6 +470,11 @@
         for disk in &saved_state.nvme_disks {
             let pci_id = disk.pci_id.clone();
+
+            // Read the PCI vendor/device IDs once and cache the resulting
+            // keepalive compatibility flag on the `NvmeDriverManager`.
+            let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id);
+
             let nvme_driver = self
                 .context
                 .nvme_driver_spawner
@@ -456,7 +482,7 @@
                     &self.context.driver_source,
                     &pci_id,
                     saved_state.cpu_count,
-                    save_restore_supported,
+                    save_restore_supported && keepalive_compatible,
                     Some(&disk.driver_state),
                 )
                 .await?;
@@ -468,6 +494,7 @@
                     &pci_id,
                     self.context.vp_count,
                     true, // save_restore_supported is always `true` when restoring.
+                    keepalive_compatible,
                     Some(nvme_driver),
                     self.context.nvme_driver_spawner.clone(),
                 )?,
@@ -1068,9 +1095,16 @@ mod tests {
         ));
 
         // Create a driver manager
-        let driver_manager =
-            NvmeDriverManager::new(&driver_source, "0000:00:04.0", 4, false, None, spawner)
-                .unwrap();
+        let driver_manager = NvmeDriverManager::new(
+            &driver_source,
+            "0000:00:04.0",
+            4,
+            false,
+            false,
+            None,
+            spawner,
+        )
+        .unwrap();
 
         let client = driver_manager.client().clone();
 
@@ -1154,6 +1188,7 @@ mod tests {
             "0000:00:05.0",
             4,
             true, // save_restore_supported
+            true,
             None,
             spawner,
         )
diff --git a/openhcl/underhill_core/src/nvme_manager/mod.rs b/openhcl/underhill_core/src/nvme_manager/mod.rs
index cc8977de45..9244f7ba50 100644
--- a/openhcl/underhill_core/src/nvme_manager/mod.rs
+++ b/openhcl/underhill_core/src/nvme_manager/mod.rs
@@ -67,6 +67,14 @@ pub struct NamespaceError {
     source: NvmeSpawnerError,
 }
 
+/// PCI vendor ID, as it appears in the sysfs `vendor` file (e.g. `0x0100`),
+/// for NVMe devices that are incompatible with keepalive.
+const KEEPALIVE_INCOMPATIBLE_VENDOR_ID: &str = "0x1414";
+
+/// PCI device ID, as it appears in the sysfs `device` file (e.g. `0x0100`),
+/// for NVMe devices that are incompatible with keepalive.
+const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0xb111";
+
 #[derive(Debug, Error)]
 pub enum NvmeSpawnerError {
     #[error("failed to initialize vfio device")]
@@ -111,3 +119,45 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync {
         saved_state: Option<&nvme_driver::save_restore::NvmeDriverSavedState>,
     ) -> Result, NvmeSpawnerError>;
 }
+
+/// Returns whether the given PCI device is compatible with NVMe keepalive.
+pub(crate) fn is_nvme_keepalive_compatible(pci_id: &str) -> bool {
+    match read_pci_vendor_device_ids(&pci_id) {
+        Ok((vendor_id, device_id)) => {
+            vendor_id != KEEPALIVE_INCOMPATIBLE_VENDOR_ID
+                || device_id != KEEPALIVE_INCOMPATIBLE_DEVICE_ID
+        }
+        Err(err) => {
+            tracing::warn!(
+                pci_id = %pci_id,
+                error = err.as_ref() as &dyn std::error::Error,
+                "failed to read PCI vendor/device IDs; treating device as not keepalive-compatible"
+            );
+            false
+        }
+    }
+}
+
+/// Reads the sysfs `vendor` and `device` files for the given PCI device,
+/// returning the trimmed contents (e.g. `"0x0100"`).
+///
+/// Callers should invoke this once per device and cache the result, since
+/// the values do not change for the lifetime of the device.
+fn read_pci_vendor_device_ids(pci_id: &str) -> anyhow::Result<(String, String)> {
+    let devpath = std::path::Path::new("/sys/bus/pci/devices").join(pci_id);
+    let vendor = fs_err::read_to_string(devpath.join("vendor"))?
+        .trim_end()
+        .to_owned();
+    let device = fs_err::read_to_string(devpath.join("device"))?
+        .trim_end()
+        .to_owned();
+
+    tracing::info!(
+        pci_id = %pci_id,
+        vendor = %vendor,
+        device = %device,
+        "read PCI vendor/device IDs"
+    );
+
+    Ok((vendor, device))
+}
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
index 25bb4472df..cc5c2e2916 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
@@ -831,6 +831,7 @@ impl NvmeDriver {
                 tracing::info!(
                     id = a.qid,
                     pending_commands_count = a.handler_data.pending_cmds.commands.len(),
+                    base_pfn = a.base_pfn,
                     ?pci_id,
                     "restoring admin queue",
                 );
@@ -914,9 +915,10 @@ impl NvmeDriver {
                 .io
                 .iter()
                 .map(|io_state| format!(
-                    "{{qid={}, pending_commands_count={}}}",
+                    "{{qid={}, pending_commands_count={}, base_pfn={}}}",
                     io_state.queue_data.qid,
-                    io_state.queue_data.handler_data.pending_cmds.commands.len()
+                    io_state.queue_data.handler_data.pending_cmds.commands.len(),
+                    io_state.queue_data.base_pfn,
                 ))
                 .collect::<Vec<_>>()
                 .join(", "),
@@ -1270,7 +1272,7 @@ async fn handle_asynchronous_events(
 impl Drop for NvmeDriver {
     fn drop(&mut self) {
-        tracing::trace!(pci_id = ?self.device_id, ka = self.nvme_keepalive, task = self.task.is_some(), "dropping nvme driver");
+        tracing::info!(pci_id = ?self.device_id, ka = self.nvme_keepalive, task = self.task.is_some(), "dropping nvme driver");
         if self.task.is_some() {
             // Do not reset NVMe device when nvme_keepalive is requested.
            tracing::debug!(nvme_keepalive = self.nvme_keepalive, pci_id = ?self.device_id, "dropping nvme driver");
@@ -1308,32 +1310,29 @@ impl AsyncRun for DriverWorkerTask {
         stop: &mut task_control::StopTask<'_>,
         state: &mut WorkerState,
     ) -> Result<(), task_control::Cancelled> {
-        let r = stop
-            .until_stopped(async {
-                loop {
-                    match self.recv.next().await {
-                        Some(NvmeWorkerRequest::CreateIssuer(rpc)) => {
-                            rpc.handle(async |cpu| self.create_io_issuer(state, cpu).await)
-                                .await
-                        }
-                        Some(NvmeWorkerRequest::Save(rpc)) => {
-                            rpc.handle(async |span| {
-                                let child_span = tracing::info_span!(
-                                    parent: &span,
-                                    "nvme_worker_save",
-                                    pci_id = %self.device.id()
-                                );
-                                self.save(state).instrument(child_span).await
-                            })
-                            .await
-                        }
-                        None => break,
-                    }
-                }
-            })
-            .await;
-        tracing::info!(pci_id = %self.device.id(), "nvme worker task exiting");
-        r
+        loop {
+            let cmd = stop.until_stopped(self.recv.next()).await?;
+            match cmd {
+                Some(NvmeWorkerRequest::CreateIssuer(rpc)) => {
+                    rpc.handle(async |cpu| self.create_io_issuer(state, cpu).await)
+                        .await
+                }
+                Some(NvmeWorkerRequest::Save(rpc)) => {
+                    rpc.handle(async |span| {
+                        let child_span = tracing::info_span!(
+                            parent: &span,
+                            "nvme_worker_save",
+                            pci_id = %self.device.id()
+                        );
+                        self.save(state).instrument(child_span).await
+                    })
+                    .await
+                }
+                None => break,
+            }
+        }
+        tracing::info!(pci_id = %self.device.id(), "nvme worker task exiting cleanly");
+        Ok(())
     }
 }
 
@@ -1398,7 +1397,7 @@ impl DriverWorkerTask {
     }
 
     async fn create_io_issuer(&mut self, state: &mut WorkerState, cpu: u32) {
-        tracing::debug!(cpu, pci_id = ?self.device.id(), "issuer request");
+        tracing::info!(cpu, pci_id = ?self.device.id(), "create io issuer request");
         if self.io_issuers.per_cpu[cpu as usize].get().is_some() {
             return;
         }
@@ -1496,7 +1495,7 @@
         let iv = qid - 1;
         self.next_ioq_id += 1;
 
-        tracing::debug!(cpu, qid, iv, pci_id = ?self.device.id(), "creating io queue");
+        tracing::info!(cpu, qid, iv, pci_id = ?self.device.id(), "create_nvme_io_queue_fn: creating io queue");
 
         let interrupt = self
             .device
@@ -1516,7 +1515,7 @@
                 qid,
                 cpu,
                 pci_id = ?self.device.id(),
-                "created io queue in SelfDrained state"
+                "create_nvme_io_queue_fn: created io queue in SelfDrained state"
             );
         }
 
@@ -1533,6 +1532,7 @@
             drain_after_restore,
         )
         .map_err(|err| DeviceError::IoQueuePairCreationFailure(err, qid))?;
+        tracing::info!(cpu, qid, pci_id = ?self.device.id(), "create_nvme_io_queue_fn: created the internal queue pair object.");
 
         assert_eq!(queue.sq_entries(), queue.cq_entries());
         state.qsize = queue.sq_entries();
@@ -1599,6 +1599,7 @@
         };
 
         if let Err(err) = r.await {
+            tracing::info!(cpu, qid, pci_id = ?self.device.id(), created_completion_queue, error = ?err, "create_nvme_io_queue_fn: failed to create IO queue, tearing down and deleting internal resources.");
             if created_completion_queue {
                 if let Err(err) = admin
                     .issue_raw(spec::Command {
@@ -1619,6 +1620,7 @@
             return Err(DeviceError::Other(err));
         }
 
+        tracing::info!(cpu, qid, pci_id = ?self.device.id(), "create_nvme_io_queue_fn: successfully created io queue.");
         Ok(IoIssuer {
             issuer: io_queue.queue.issuer().clone(),
             cpu,
@@ -1703,6 +1705,7 @@
                 pci_id = ?self.device.id(),
                 id = admin_state.qid,
                 pending_commands_count = admin_state.handler_data.pending_cmds.commands.len(),
+                pfn = admin_state.base_pfn,
                 "saved admin queue",
             );
             Some(admin_state)
@@ -1758,9 +1761,10 @@
             state = io
                 .iter()
                 .map(|io_state| format!(
-                    "{{qid={}, pending_commands_count={}}}",
+                    "{{qid={}, pending_commands_count={}, base_pfn={}}}",
                     io_state.queue_data.qid,
-                    io_state.queue_data.handler_data.pending_cmds.commands.len()
+                    io_state.queue_data.handler_data.pending_cmds.commands.len(),
+                    io_state.queue_data.base_pfn
                 ))
                 .collect::<Vec<_>>()
                 .join(", "),
diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
index 73fc432a53..5b6eb831b2 100644
--- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
+++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs
@@ -93,11 +93,12 @@ impl PendingCommands {
     const MAX_CIDS: usize = 1 << Self::CID_KEY_BITS;
     const CID_SEQ_OFFSET: Wrapping<u16> = Wrapping(1 << Self::CID_KEY_BITS);
 
-    fn new(qid: u16) -> Self {
+    fn new(qid: u16, device_id: String) -> Self {
         Self {
             commands: Slab::new(),
             next_cid_high_bits: Wrapping(0),
             qid,
+            device_id,
         }
     }
 
@@ -124,7 +125,19 @@
         entry.insert(PendingCommand {
             command: *command,
             respond,
-            submitted_at: (self.qid == 0).then(Instant::now),
+            submitted_at: (self.qid == 0).then(|| {
+                tracing::info!(
+                    pci_id = %self.device_id,
+                    ?cid,
+                    opcode = ?command.cdw0.opcode(),
+                    opname = ?spec::AdminOpcode(command.cdw0.opcode()),
+                    nsid = ?command.nsid,
+                    cdw10 = command.cdw10,
+                    cdw11 = command.cdw11,
+                    "inserted admin command",
+                );
+                Instant::now()
+            }),
         });
     }
 
@@ -140,6 +153,19 @@
             self.qid,
             command.command.cdw0.opcode(),
         );
+        if let Some(submitted_at) = command.submitted_at {
+            tracing::info!(
+                pci_id = %self.device_id,
+                ?cid,
+                opcode = ?command.command.cdw0.opcode(),
+                opname = ?spec::AdminOpcode(command.command.cdw0.opcode()),
+                nsid = ?command.command.nsid,
+                cdw10 = command.command.cdw10,
+                cdw11 = command.command.cdw11,
+                elapsed = ?submitted_at.elapsed(),
+                "completed admin command",
+            );
+        }
         command.respond
     }
 
@@ -161,7 +187,11 @@
     }
 
     /// Restore pending commands from the saved state.
-    pub fn restore(saved_state: &PendingCommandsSavedState, qid: u16) -> anyhow::Result<Self> {
+    pub fn restore(
+        saved_state: &PendingCommandsSavedState,
+        qid: u16,
+        device_id: String,
+    ) -> anyhow::Result<Self> {
         let PendingCommandsSavedState {
             commands,
             next_cid_high_bits,
@@ -188,6 +218,7 @@
                 .collect::>(),
             next_cid_high_bits: Wrapping(*next_cid_high_bits),
             qid,
+            device_id,
         })
     }
 }
@@ -508,7 +539,7 @@ impl QueuePair {
             QueueHandler {
                 sq: SubmissionQueue::new(qid, sq_entries, sq_mem_block),
                 cq: CompletionQueue::new(qid, cq_entries, cq_mem_block),
-                commands: PendingCommands::new(qid),
+                commands: PendingCommands::new(qid, device_id.into()),
                 stats: Default::default(),
                 drain_after_restore,
                 aer_handler,
@@ -947,6 +978,7 @@ struct PendingCommands {
     #[inspect(hex)]
     next_cid_high_bits: Wrapping<u16>,
     qid: u16,
+    device_id: String,
 }
 
 #[derive(Inspect)]
@@ -1360,7 +1392,7 @@ impl QueueHandler {
         Ok(Self {
             sq: SubmissionQueue::restore(sq_mem_block, sq_state)?,
             cq: CompletionQueue::restore(cq_mem_block, cq_state)?,
-            commands: PendingCommands::restore(pending_cmds, sq_state.sqid)?,
+            commands: PendingCommands::restore(pending_cmds, sq_state.sqid, device_id.into())?,
             stats: Default::default(),
             // Only drain pending commands for I/O queues.
             // Admin queue is expected to have pending Async Event requests.
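Aside on the `PendingCommands` bookkeeping that the queue_pair.rs hunks above thread `device_id` through: the constants `MAX_CIDS = 1 << CID_KEY_BITS` and `CID_SEQ_OFFSET = Wrapping(1 << CID_KEY_BITS)`, together with the `next_cid_high_bits` counter, imply that each NVMe command identifier packs a slab key into its low bits and a wrapping sequence counter into its high bits, so a stale completion cannot be confused with a recycled slot. Below is a minimal free-standing sketch of that packing; the `CID_KEY_BITS` value and the helper names (`pack_cid`, `slab_key_of`) are assumptions for illustration, not the driver's actual code.

```rust
use std::num::Wrapping;

// Assumed width of the slab-key field; the real CID_KEY_BITS value is
// not shown in this diff.
const CID_KEY_BITS: u32 = 10;
const MAX_CIDS: usize = 1 << CID_KEY_BITS;
const CID_SEQ_OFFSET: Wrapping<u16> = Wrapping(1 << CID_KEY_BITS);

/// Hypothetical helper: packs a slab key (low bits) and a wrapping
/// sequence counter (high bits) into one NVMe command identifier.
fn pack_cid(slab_key: usize, seq: Wrapping<u16>) -> u16 {
    assert!(slab_key < MAX_CIDS);
    (seq * CID_SEQ_OFFSET).0 | slab_key as u16
}

/// Hypothetical helper: recovers the slab key from a completion's CID;
/// the surviving high bits let the handler reject a CID left over from
/// a previous use of the same slot.
fn slab_key_of(cid: u16) -> usize {
    (cid as usize) & (MAX_CIDS - 1)
}

fn main() {
    let mut next_high_bits = Wrapping(0u16);
    let cid = pack_cid(5, next_high_bits);
    next_high_bits += Wrapping(1); // advance per insertion, wrapping at the field width
    assert_eq!(slab_key_of(cid), 5);
    println!("cid={cid:#06x}");
}
```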
diff --git a/vm/devices/storage/nvme_resources/src/fault.rs b/vm/devices/storage/nvme_resources/src/fault.rs
index 8fcad86766..8ed05989b6 100644
--- a/vm/devices/storage/nvme_resources/src/fault.rs
+++ b/vm/devices/storage/nvme_resources/src/fault.rs
@@ -315,6 +315,42 @@ pub struct CommandMatch {
     pub mask: [u8; 64],
 }
 
+/// A fault configuration for overriding the device's hardware configuration
+/// (i.e. the values reported in PCI configuration space).
+///
+/// When a field is `Some`, its value replaces the corresponding ID in the
+/// NVMe fault controller's PCI hardware IDs. When `None`, the real
+/// hardware ID value is used unchanged. Unlike most other faults in this
+/// module, hardware config faults are applied at device construction time
+/// and are therefore not gated by the `fault_active` flag.
+///
+/// # Example
+/// Override the Vendor ID and Device ID reported in PCI config space.
+/// ```no_run
+/// use mesh::CellUpdater;
+/// use nvme_resources::fault::FaultConfiguration;
+/// use nvme_resources::fault::HardwareConfigFaultConfig;
+///
+/// pub fn hardware_config_fault() -> FaultConfiguration {
+///     let mut fault_start_updater = CellUpdater::new(false);
+///     FaultConfiguration::new(fault_start_updater.cell())
+///         .with_hardware_config_fault(
+///             HardwareConfigFaultConfig::new()
+///                 .with_vendor_id(0x1414)
+///                 .with_device_id(0x00a9),
+///         )
+/// }
+/// ```
+#[derive(Debug, Copy, Clone, Default, MeshPayload)]
+pub struct HardwareConfigFaultConfig {
+    /// Override for the device's PCI Vendor ID. When `None`, the real Vendor
+    /// ID is used.
+    pub vendor_id: Option<u16>,
+    /// Override for the device's PCI Device ID. When `None`, the real Device
+    /// ID is used.
+    pub device_id: Option<u16>,
+}
+
 /// Fault configuration for the NVMe fault controller.
 ///
 /// This struct defines behaviors that inject faults into the NVMe fault controller logic,
@@ -390,6 +426,8 @@ pub struct FaultConfiguration {
     pub namespace_fault: NamespaceFaultConfig,
     /// Fault to apply to all IO queues
     pub io_fault: Arc<IoQueueFaultConfig>,
+    /// Fault to apply to the hardware configuration
+    pub hardware_config_fault: Option<HardwareConfigFaultConfig>,
 }
 
 impl FaultConfiguration {
@@ -404,6 +442,7 @@ impl FaultConfiguration {
             pci_fault: Some(PciFaultConfig::new()),
             namespace_fault: NamespaceFaultConfig::new(mesh::channel().1),
             io_fault: Arc::new(IoQueueFaultConfig::new(fault_active)),
+            hardware_config_fault: None,
         }
     }
 
@@ -430,6 +469,35 @@ impl FaultConfiguration {
         self.namespace_fault = namespace_fault;
         self
     }
+
+    /// Add a hardware config fault configuration to the fault configuration
+    pub fn with_hardware_config_fault(
+        mut self,
+        hardware_config_fault: HardwareConfigFaultConfig,
+    ) -> Self {
+        self.hardware_config_fault = Some(hardware_config_fault);
+        self
+    }
+}
+
+impl HardwareConfigFaultConfig {
+    /// Create a new no-op hardware config fault configuration. All fields
+    /// default to `None`, meaning the real hardware values are used.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Override the PCI Vendor ID reported by the device.
+    pub fn with_vendor_id(mut self, vendor_id: u16) -> Self {
+        self.vendor_id = Some(vendor_id);
+        self
+    }
+
+    /// Override the PCI Device ID reported by the device.
+    pub fn with_device_id(mut self, device_id: u16) -> Self {
+        self.device_id = Some(device_id);
+        self
+    }
 }
 
 impl PciFaultConfig {
diff --git a/vm/devices/storage/nvme_test/src/pci.rs b/vm/devices/storage/nvme_test/src/pci.rs
index 278793d3e8..396951f19b 100644
--- a/vm/devices/storage/nvme_test/src/pci.rs
+++ b/vm/devices/storage/nvme_test/src/pci.rs
@@ -137,10 +137,21 @@ impl NvmeFaultController {
             BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
         );
 
+        // Apply any hardware-config fault overrides for the IDs reported in
+        // PCI configuration space, falling back to the real values when no
+        // override is configured.
+        let hardware_config_fault = fault_configuration.hardware_config_fault.take();
+        let vendor_id = hardware_config_fault
+            .and_then(|f| f.vendor_id)
+            .unwrap_or(VENDOR_ID);
+        let device_id = hardware_config_fault
+            .and_then(|f| f.device_id)
+            .unwrap_or(0x00a9);
+
         let cfg_space = ConfigSpaceType0Emulator::new(
             HardwareIds {
-                vendor_id: VENDOR_ID,
-                device_id: 0x00a9,
+                vendor_id,
+                device_id,
                 revision_id: 0,
                 prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
                 sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs
index e3c74aa96f..997188c5fe 100644
--- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs
+++ b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs
@@ -19,6 +19,7 @@ use nvme_resources::NvmeFaultControllerHandle;
 use nvme_resources::fault::AdminQueueFaultBehavior;
 use nvme_resources::fault::AdminQueueFaultConfig;
 use nvme_resources::fault::FaultConfiguration;
+use nvme_resources::fault::HardwareConfigFaultConfig;
 use nvme_resources::fault::IoQueueFaultBehavior;
 use nvme_resources::fault::IoQueueFaultConfig;
 use nvme_resources::fault::NamespaceChange;
@@ -147,9 +148,7 @@ async fn servicing_keepalive_no_device(
 ) -> anyhow::Result<()> {
     let flags = config.default_servicing_flags();
     openhcl_servicing_core(
-        config.with_openhcl_command_line(
-            "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0",
-        ),
+        config.with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512"),
         igvm_file,
         flags,
         DEFAULT_SERVICING_COUNT,
@@ -198,9 +197,7 @@ async fn servicing_keepalive_with_device(
     let flags = config.default_servicing_flags();
     openhcl_servicing_core(
         config
-            .with_openhcl_command_line(
-                "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0",
-            )
+            .with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512")
            .with_boot_device_type(petri::BootDeviceType::ScsiViaNvme)
            .with_vmbus_redirect(true), // Need this to attach the NVMe device
         igvm_file,
@@ -1169,6 +1166,196 @@ async fn servicing_keepalive_slow_create_io_queue(
     Ok(())
 }
 
+/// Verifies the per-device NVMe keepalive gate.
+///
+/// Two NVMe controllers are attached to a single VM:
+/// * A "c05b" device, whose VPCI instance ID has `data2 = 0xc05b` and whose
+///   derived pci_id is therefore `c05b:00:00.0`. It reports the fault
+///   controller's default PCI vendor/device IDs, which the gate treats as
+///   keepalive-compatible. Keepalive must be honored for this device: its
+///   controller is not reset across servicing, so
+///   `CREATE_IO_COMPLETION_QUEUE` must NOT be issued after servicing. The
+///   fault panics if the opcode is observed.
+/// * A device that uses a hardware-config fault to report vendor ID 0x1414
+///   and device ID 0xb111, the pair the gate treats as
+///   keepalive-incompatible. Keepalive must be downgraded to
+///   reset-on-servicing for this device, so `CREATE_IO_COMPLETION_QUEUE`
+///   MUST be issued after servicing. The fault verifies (via a oneshot)
+///   that the opcode is observed.
+///
+/// Keepalive is enabled VM-wide; the gate is expected to apply per device
+/// based on the PCI vendor/device IDs each device reports.
+#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])]
+async fn servicing_keepalive_per_device_gate(
+    config: PetriVmBuilder,
+    (igvm_file,): (ResolvedArtifact,),
+) -> Result<(), anyhow::Error> {
+    let mut flags = config.default_servicing_flags();
+    flags.enable_nvme_keepalive = true;
+
+    // VPCI instance IDs. The pci_id seen by the NVMe driver is derived
+    // as `format!("{:04x}:00:00.0", instance_id.data2)` (see
+    // `vpci_path` in `dispatch/vtl2_settings_worker.rs`), so `data2`
+    // controls the prefix of the pci_id. The keepalive device uses
+    // data2 = 0xc05b, so its pci_id is `c05b:00:00.0`; the prefix is only
+    // a naming convenience, since the gate keys off the reported PCI
+    // vendor/device IDs.
+    const WITH_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("00000000-c05b-0000-0000-000000000001");
+    const NO_KEEPALIVE_NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3");
+
+    const WITH_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID;
+    const NO_KEEPALIVE_NSID: u32 = KEEPALIVE_VTL2_NSID + 1;
+    const WITH_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN;
+    const NO_KEEPALIVE_LUN: u32 = VTL0_NVME_LUN + 1;
+
+    // Two independent fault start cells, one per device.
+    let mut with_keepalive_fault_updater = CellUpdater::new(false);
+    let mut no_keepalive_fault_updater = CellUpdater::new(false);
+
+    let (no_keepalive_create_seen_send, no_keepalive_create_seen_recv) = mesh::oneshot::<()>();
+
+    // c05b device: keepalive must be honored. Fail loudly if any
+    // CREATE_IO_COMPLETION_QUEUE is observed after the fault is armed.
+    let with_keepalive_fault_config = FaultConfiguration::new(with_keepalive_fault_updater.cell())
+        .with_admin_queue_fault(
+            AdminQueueFaultConfig::new().with_submission_queue_fault(
+                CommandMatchBuilder::new()
+                    .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0)
+                    .build(),
+                AdminQueueFaultBehavior::Panic(
+                    "c05b device received CREATE_IO_COMPLETION_QUEUE after servicing — \
+                    keepalive should have been honored for this device but the controller \
+                    was reset."
+                        .to_string(),
+                ),
+            ),
+        );
+
+    // Non-c05b device: keepalive must be downgraded, because its
+    // hardware-config fault reports the keepalive-incompatible
+    // vendor/device IDs. Verify CREATE_IO_COMPLETION_QUEUE IS issued
+    // after servicing.
+    let no_keepalive_fault_config = FaultConfiguration::new(no_keepalive_fault_updater.cell())
+        .with_admin_queue_fault(
+            AdminQueueFaultConfig::new().with_submission_queue_fault(
+                CommandMatchBuilder::new()
+                    .match_cdw0_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0)
+                    .build(),
+                AdminQueueFaultBehavior::Verify(Some(no_keepalive_create_seen_send)),
+            ),
+        )
+        .with_hardware_config_fault(
+            HardwareConfigFaultConfig::new()
+                .with_vendor_id(0x1414)
+                .with_device_id(0xb111),
+        );
+
+    let scsi_instance = Guid::new_random();
+
+    let (mut vm, agent) = config
+        .with_vmbus_redirect(true)
+        .with_openhcl_command_line(
+            "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0",
+        )
+        .modify_backend(move |b| {
+            b.with_custom_config(move |c| {
+                c.vpci_devices.push(VpciDeviceConfig {
+                    vtl: DeviceVtl::Vtl2,
+                    instance_id: NO_KEEPALIVE_NVME_INSTANCE,
+                    resource: NvmeFaultControllerHandle {
+                        subsystem_id: Guid::new_random(),
+                        msix_count: 10,
+                        max_io_queues: 10,
+                        namespaces: vec![NamespaceDefinition {
+                            nsid: NO_KEEPALIVE_NSID,
+                            read_only: false,
+                            disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle {
+                                len: Some(DEFAULT_DISK_SIZE),
+                                sector_size: None,
+                            })
+                            .into_resource(),
+                        }],
+                        fault_config: no_keepalive_fault_config,
+                        enable_tdisp_tests: false,
+                    }
+                    .into_resource(),
+                });
+                c.vpci_devices.push(VpciDeviceConfig {
+                    vtl: DeviceVtl::Vtl2,
+                    instance_id: WITH_KEEPALIVE_NVME_INSTANCE,
+                    resource: NvmeFaultControllerHandle {
+                        subsystem_id: Guid::new_random(),
+                        msix_count: 10,
+                        max_io_queues: 10,
+                        namespaces: vec![NamespaceDefinition {
+                            nsid: WITH_KEEPALIVE_NSID,
+                            read_only: false,
+                            disk: LayeredDiskHandle::single_layer(RamDiskLayerHandle {
+                                len: Some(DEFAULT_DISK_SIZE),
+                                sector_size: None,
+                            })
+                            .into_resource(),
+                        }],
+                        fault_config: with_keepalive_fault_config,
+                        enable_tdisp_tests: false,
+                    }
+                    .into_resource(),
+                });
+            })
+        })
+        .add_vtl2_storage_controller(
+            Vtl2StorageControllerBuilder::new(ControllerType::Scsi)
+                .with_instance_id(scsi_instance)
+                .add_lun(
+                    Vtl2LunBuilder::disk()
+                        .with_location(NO_KEEPALIVE_LUN)
+                        .with_physical_device(Vtl2StorageBackingDeviceBuilder::new(
+                            ControllerType::Nvme,
+                            NO_KEEPALIVE_NVME_INSTANCE,
+                            NO_KEEPALIVE_NSID,
+                        )),
+                )
+                .add_lun(
+                    Vtl2LunBuilder::disk()
+                        .with_location(WITH_KEEPALIVE_LUN)
+                        .with_physical_device(Vtl2StorageBackingDeviceBuilder::new(
+                            ControllerType::Nvme,
+                            WITH_KEEPALIVE_NVME_INSTANCE,
+                            WITH_KEEPALIVE_NSID,
+                        )),
+                )
+                .build(),
+        )
+        .run()
+        .await?;
+
+    agent.ping().await?;
+
+    // Arm both faults BEFORE servicing so that the post-servicing
+    // CREATE_IO_COMPLETION_QUEUE is what each fault matches.
+    with_keepalive_fault_updater.set(true).await;
+    no_keepalive_fault_updater.set(true).await;
+
+    vm.restart_openhcl(igvm_file.clone(), flags).await?;
+
+    agent.ping().await?;
+
+    // The non-c05b device must issue CREATE_IO_COMPLETION_QUEUE after
+    // servicing because its keepalive was downgraded to reset.
+    CancelContext::new()
+        .with_timeout(Duration::from_secs(60))
+        .until_cancelled(no_keepalive_create_seen_recv)
+        .await
+        .expect(
+            "non-c05b NVMe device did not issue CREATE_IO_COMPLETION_QUEUE within 60s after \
+            servicing — the per-device keepalive gate did not downgrade keepalive for this \
+            device.",
+        )
+        .expect("CREATE_IO_COMPLETION_QUEUE verification on non-c05b device failed");
+
+    // If the c05b device had received CREATE_IO_COMPLETION_QUEUE, the
+    // panic fault would have crashed the VM and failed this test. We
+    // disarm the faults defensively before tearing down.
+    with_keepalive_fault_updater.set(false).await;
+    no_keepalive_fault_updater.set(false).await;
+
+    Ok(())
+}
+
 async fn apply_fault_with_keepalive(
     config: PetriVmBuilder,
     fault_configuration: FaultConfiguration,
@@ -1219,9 +1406,7 @@
     config
         .with_vmbus_redirect(true)
-        .with_openhcl_command_line(
-            "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0",
-        )
+        .with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512")
        .modify_backend(move |b| {
            b.with_custom_config(|c| {
                // Add a fault controller to test the nvme controller functionality
@@ -1309,9 +1494,9 @@ async fn create_keepalive_test_config_custom(
 ) -> Result<(PetriVm, PipetteClient), anyhow::Error> {
     const NVME_INSTANCE: Guid = guid::guid!("dce4ebad-182f-46c0-8d30-8446c1c62ab3");
 
-    let mut builder = config.with_vmbus_redirect(true).with_openhcl_command_line(
-        "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0",
-    );
+    let mut builder = config
+        .with_vmbus_redirect(true)
+        .with_openhcl_command_line("OPENHCL_ENABLE_VTL2_GPA_POOL=512");
 
     for cmdline in extra_cmdlines {
         builder = builder.with_openhcl_command_line(cmdline);
diff --git a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs
index b43f0ec855..593e78dcc6 100644
--- a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs
+++ b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_uefi.rs
@@ -193,7 +193,7 @@ async fn nvme_relay_explicit_private_pool(
     nvme_relay_test_core(
         config,
         NvmeRelayTestParams {
-            openhcl_cmdline: "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_DISABLE_NVME_KEEP_ALIVE=0",
+            openhcl_cmdline: "OPENHCL_ENABLE_VTL2_GPA_POOL=512",
             expected_props: Some(ExpectedNvmeDeviceProperties {
                 save_restore_supported: true,
                 qsize: 256, // private pool should allow contiguous allocations.
@@ -218,7 +218,7 @@ async fn nvme_relay_heuristic_debug_16vp_768mb_heavy(
     nvme_relay_test_core(
         config,
         NvmeRelayTestParams {
-            openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0",
+            openhcl_cmdline: "",
             processor_topology: Some(ProcessorTopology::heavy()),
             vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate {
                 size: Some(768 * 1024 * 1024),
@@ -246,7 +246,7 @@ async fn nvme_relay_heuristic_release_16vp_256mb_heavy(
     nvme_relay_test_core(
         config,
         NvmeRelayTestParams {
-            openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0",
+            openhcl_cmdline: "",
             processor_topology: Some(ProcessorTopology::heavy()),
             vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate {
                 size: Some(256 * 1024 * 1024),
@@ -277,7 +277,7 @@ async fn nvme_relay_heuristic_release_32vp_500mb_very_heavy(
     nvme_relay_test_core(
         config,
         NvmeRelayTestParams {
-            openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0",
+            openhcl_cmdline: "",
             processor_topology: Some(ProcessorTopology::very_heavy()),
             vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate {
                 size: Some(500 * 1024 * 1024),
@@ -307,7 +307,7 @@ async fn nvme_relay_32vp_768mb_very_heavy(
             ..Default::default()
         }),
         NvmeRelayTestParams {
-            openhcl_cmdline: "OPENHCL_DISABLE_NVME_KEEP_ALIVE=0 OPENHCL_ENABLE_VTL2_GPA_POOL=10240",
+            openhcl_cmdline: "OPENHCL_ENABLE_VTL2_GPA_POOL=10240",
             processor_topology: Some(ProcessorTopology::very_heavy()),
             vtl2_base_address_type: Some(openvmm_defs::config::Vtl2BaseAddressType::Vtl2Allocate {
                 size: Some(768 * 1024 * 1024),
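A closing note on the cmdline flip at the top of this diff: `disable_nvme_keep_alive` now defaults to `false`, and `OPENHCL_DISABLE_NVME_KEEP_ALIVE` becomes a pure opt-out, which is why the `=0` suffix could be dropped from the test command lines above. Below is a standalone sketch of the resulting semantics; it is an illustration, not the boot loader's parser (which matches on the `DISABLE_NVME_KEEP_ALIVE` constant inside `BootCommandLineOptions::parse` alongside the other options).

```rust
/// Sketch of the new OPENHCL_DISABLE_NVME_KEEP_ALIVE semantics: keepalive
/// is enabled unless the flag is present with a value other than "0".
fn disable_nvme_keep_alive(cmdline: &str) -> bool {
    let mut disable = false; // new default: keepalive stays enabled
    for arg in cmdline.split_whitespace() {
        if let Some(value) = arg.strip_prefix("OPENHCL_DISABLE_NVME_KEEP_ALIVE=") {
            if value != "0" {
                disable = true; // explicit opt-out, e.g. "=1"
            }
        }
    }
    disable
}

fn main() {
    assert!(!disable_nvme_keep_alive("")); // default: keepalive on
    assert!(!disable_nvme_keep_alive("OPENHCL_DISABLE_NVME_KEEP_ALIVE=0")); // now a no-op
    assert!(disable_nvme_keep_alive("OPENHCL_DISABLE_NVME_KEEP_ALIVE=1"));
}
```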