Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions openhcl/openhcl_boot/src/cmdline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ impl BootCommandLineOptions {
confidential_debug: false,
enable_vtl2_gpa_pool: Vtl2GpaPoolConfig::Heuristics(Vtl2GpaPoolLookupTable::Release), // use the release config by default
sidecar: SidecarOptions::default(),
disable_nvme_keep_alive: true,
disable_nvme_keep_alive: false,
vtl2_gpa_pool_numa_split: false,
}
}
Expand All @@ -141,9 +141,6 @@ impl BootCommandLineOptions {
impl BootCommandLineOptions {
/// Parse arguments from a command line.
pub fn parse(&mut self, cmdline: &str) {
// Workaround for a host side issue: disable NVMe keepalive by default.
self.disable_nvme_keep_alive = true;

let mut override_vtl2_gpa_pool: Option<Vtl2GpaPoolConfig> = None;
for arg in cmdline.split_whitespace() {
if arg.starts_with(OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME) {
Expand Down Expand Up @@ -186,8 +183,8 @@ impl BootCommandLineOptions {
}
} else if arg.starts_with(DISABLE_NVME_KEEP_ALIVE) {
let arg = arg.split_once('=').map(|(_, arg)| arg);
if arg.is_some_and(|a| a == "0") {
self.disable_nvme_keep_alive = false;
if arg.is_some_and(|a| a != "0") {
self.disable_nvme_keep_alive = true;
}
} else if arg.starts_with(VTL2_GPA_POOL_NUMA) {
if let Some((_, arg)) = arg.split_once('=') {
Expand Down
8 changes: 8 additions & 0 deletions openhcl/underhill_core/src/nvme_manager/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ enum NvmeDriverRequest {
pub struct NvmeDriverManager {
task: Task<()>,
pci_id: String,
keepalive_compatible: bool,
client: NvmeDriverManagerClient,
}

Expand All @@ -358,12 +359,18 @@ impl NvmeDriverManager {
&self.client
}

/// Returns whether the underlying device is compatible with NVMe keepalive.
pub fn keepalive_compatible(&self) -> bool {
self.keepalive_compatible
}

/// Creates the [`NvmeDriverManager`].
pub fn new(
driver_source: &VmTaskDriverSource,
pci_id: &str,
vp_count: u32,
save_restore_supported: bool,
keepalive_compatible: bool,
device: Option<Box<dyn NvmeDevice>>,
nvme_driver_spawner: Arc<dyn CreateNvmeDriver>,
) -> anyhow::Result<Self> {
Expand All @@ -382,6 +389,7 @@ impl NvmeDriverManager {
Ok(Self {
task,
pci_id: pci_id.into(),
keepalive_compatible,
client: NvmeDriverManagerClient {
pci_id: pci_id.into(),
sender: send,
Expand Down
51 changes: 43 additions & 8 deletions openhcl/underhill_core/src/nvme_manager/manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use crate::nvme_manager::CreateNvmeDriver;
use crate::nvme_manager::device::NvmeDriverManager;
use crate::nvme_manager::device::NvmeDriverManagerClient;
use crate::nvme_manager::device::NvmeDriverShutdownOptions;
use crate::nvme_manager::is_nvme_keepalive_compatible;
use crate::nvme_manager::save_restore::NvmeManagerSavedState;
use crate::nvme_manager::save_restore::NvmeSavedDiskConfig;
use crate::servicing::NvmeSavedState;
Expand Down Expand Up @@ -301,12 +302,17 @@ impl NvmeManagerWorker {

async {
join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| {
let keepalive_compatible = driver.keepalive_compatible();
driver
.shutdown(NvmeDriverShutdownOptions {
// nvme_keepalive is received from host but it is only valid
// when memory pool allocator supports save/restore.
do_not_reset: nvme_keepalive && self.context.save_restore_supported,
skip_device_shutdown: nvme_keepalive && self.context.save_restore_supported,
do_not_reset: nvme_keepalive
&& self.context.save_restore_supported
&& keepalive_compatible,
skip_device_shutdown: nvme_keepalive
&& self.context.save_restore_supported
&& keepalive_compatible,
})
Comment thread
gurasinghMS marked this conversation as resolved.
.instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id))
}))
Expand Down Expand Up @@ -355,11 +361,14 @@ impl NvmeManagerWorker {
match guard.entry(pci_id.clone()) {
hash_map::Entry::Occupied(_) => unreachable!(), // We checked above that this entry does not exist.
hash_map::Entry::Vacant(entry) => {
let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id);

Comment on lines 361 to +365
let driver = NvmeDriverManager::new(
&context.driver_source,
&pci_id,
context.vp_count,
context.save_restore_supported,
context.save_restore_supported && keepalive_compatible,
Comment on lines +364 to +370
keepalive_compatible,
None, // No device yet,
context.nvme_driver_spawner.clone(),
)?;
Expand Down Expand Up @@ -419,12 +428,24 @@ impl NvmeManagerWorker {
/// Saves NVMe device's states into buffer during servicing.
pub async fn save(&mut self) -> anyhow::Result<NvmeManagerSavedState> {
let mut nvme_disks: Vec<NvmeSavedDiskConfig> = Vec::new();
// Only save state for devices known to be keepalive-compatible.
let mut devices_to_save: HashMap<String, NvmeDriverManagerClient> = self
.context
.devices
.write()
.iter()
Comment on lines 432 to 436
.map(|(pci_id, driver)| (pci_id.clone(), driver.client().clone()))
.filter_map(|(pci_id, driver)| {
if driver.keepalive_compatible() {
Some((pci_id.clone(), driver.client().clone()))
} else {
tracing::info!(
%pci_id,
"skipping save of nvme device; \
keepalive disabled for this device"
);
None
}
})
Comment thread
gurasinghMS marked this conversation as resolved.
Comment on lines +437 to +448
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sense. I don't know how long the logging prolongs the time inside the lock (which lock? I don't see locks in this function), so don't know if we should fix this.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know what lock it is talking about either. But unless there are hundreds of devices, which there shouldn't be, I think this is fine. @alandau you ok moving forward with this?

.collect();
Comment thread
gurasinghMS marked this conversation as resolved.
for (pci_id, client) in devices_to_save.iter_mut() {
nvme_disks.push(NvmeSavedDiskConfig {
Expand All @@ -449,14 +470,19 @@ impl NvmeManagerWorker {

for disk in &saved_state.nvme_disks {
let pci_id = disk.pci_id.clone();

// Read the PCI vendor/device IDs once and cache the resulting
// keepalive compatibility flag on the `NvmeDriverManager`.
let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id);
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alandau do you know if the device would be resolved at this point? Like if the device has not arrived back and reading the config space leads to a stall, it could impact boot times (which is no better than not doing keepalive at all)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't. But I remember at some point we ask VfioDevice to restore and it has to go to /sys to do its magic. You can try seeing if it's doing it past this point or not.


let nvme_driver = self
.context
.nvme_driver_spawner
.create_driver(
&self.context.driver_source,
&pci_id,
saved_state.cpu_count,
save_restore_supported,
save_restore_supported && keepalive_compatible,
Some(&disk.driver_state),
)
.await?;
Expand All @@ -468,6 +494,7 @@ impl NvmeManagerWorker {
&pci_id,
self.context.vp_count,
true, // save_restore_supported is always `true` when restoring.
keepalive_compatible,
Some(nvme_driver),
self.context.nvme_driver_spawner.clone(),
)?,
Expand Down Expand Up @@ -1068,9 +1095,16 @@ mod tests {
));

// Create a driver manager
let driver_manager =
NvmeDriverManager::new(&driver_source, "0000:00:04.0", 4, false, None, spawner)
.unwrap();
let driver_manager = NvmeDriverManager::new(
&driver_source,
"0000:00:04.0",
4,
false,
false,
None,
spawner,
)
.unwrap();

let client = driver_manager.client().clone();

Expand Down Expand Up @@ -1154,6 +1188,7 @@ mod tests {
"0000:00:05.0",
4,
true, // save_restore_supported
true,
None,
spawner,
)
Expand Down
50 changes: 50 additions & 0 deletions openhcl/underhill_core/src/nvme_manager/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,14 @@ pub struct NamespaceError {
source: NvmeSpawnerError,
}

/// PCI vendor ID, as it appears in the sysfs `vendor` file (e.g. `0x0100`),
/// for NVMe devices that are incompatible with keepalive.
const KEEPALIVE_INCOMPATIBLE_VENDOR_ID: &str = "0x1414";

/// PCI device ID, as it appears in the sysfs `device` file (e.g. `0x0100`),
/// for NVMe devices that are incompatible with keepalive.
const KEEPALIVE_INCOMPATIBLE_DEVICE_ID: &str = "0xb111";

#[derive(Debug, Error)]
pub enum NvmeSpawnerError {
#[error("failed to initialize vfio device")]
Expand Down Expand Up @@ -111,3 +119,45 @@ pub trait CreateNvmeDriver: Inspect + Send + Sync {
saved_state: Option<&nvme_driver::save_restore::NvmeDriverSavedState>,
) -> Result<Box<dyn NvmeDevice>, NvmeSpawnerError>;
}

/// Returns whether the given PCI device is compatible with NVMe keepalive.
pub(crate) fn is_nvme_keepalive_compatible(pci_id: &str) -> bool {
match read_pci_vendor_device_ids(&pci_id) {
Ok((vendor_id, device_id)) => {
vendor_id != KEEPALIVE_INCOMPATIBLE_VENDOR_ID
|| device_id != KEEPALIVE_INCOMPATIBLE_DEVICE_ID
}
Err(err) => {
tracing::warn!(
pci_id = %pci_id,
error = err.as_ref() as &dyn std::error::Error,
"failed to read PCI vendor/device IDs; treating device as not keepalive-compatible"
);
false
}
}
}

/// Reads the sysfs `vendor` and `device` files for the given PCI device,
/// returning the trimmed contents (e.g. `"0x0100"`).
///
/// Callers should invoke this once per device and cache the result, since
/// the values do not change for the lifetime of the device.
fn read_pci_vendor_device_ids(pci_id: &str) -> anyhow::Result<(String, String)> {
let devpath = std::path::Path::new("/sys/bus/pci/devices").join(pci_id);
let vendor = fs_err::read_to_string(devpath.join("vendor"))?
.trim_end()
.to_owned();
let device = fs_err::read_to_string(devpath.join("device"))?
.trim_end()
.to_owned();

tracing::info!(
pci_id = %pci_id,
vendor = %vendor,
device = %device,
"read PCI vendor/device IDs"
);

Ok((vendor, device))
}
Comment thread
gurasinghMS marked this conversation as resolved.
68 changes: 68 additions & 0 deletions vm/devices/storage/nvme_resources/src/fault.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,42 @@ pub struct CommandMatch {
pub mask: [u8; 64],
}

/// A fault configuration for overriding the device's hardware configuration
/// (i.e. the values reported in PCI configuration space).
///
/// When a field is `Some`, its value replaces the corresponding field reported
/// by the NVMe fault controller's PCI hardware IDs. When `None`, the real
/// hardware ID value is used unchanged. Unlike most other faults in this
/// module, hardware config faults are applied at device construction time
/// and are therefore not gated by the `fault_active` flag.
///
/// # Example
/// Override the Vendor ID and Device ID reported in PCI config space.
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::HardwareConfigFaultConfig;
///
/// pub fn hardware_config_fault() -> FaultConfiguration {
/// let mut fault_start_updater = CellUpdater::new(false);
/// FaultConfiguration::new(fault_start_updater.cell())
/// .with_hardware_config_fault(
/// HardwareConfigFaultConfig::new()
/// .with_vendor_id(0x1414)
/// .with_device_id(0x00a9),
/// )
/// }
/// ```
#[derive(Debug, Copy, Clone, Default, MeshPayload)]
pub struct HardwareConfigFaultConfig {
/// Override for the device's PCI Vendor ID. When `None`, the real Vendor
/// ID is used.
pub vendor_id: Option<u16>,
/// Override for the device's PCI Device ID. When `None`, the real Device
/// ID is used.
pub device_id: Option<u16>,
}

/// Fault configuration for the NVMe fault controller.
///
/// This struct defines behaviors that inject faults into the NVMe fault controller logic,
Expand Down Expand Up @@ -390,6 +426,8 @@ pub struct FaultConfiguration {
pub namespace_fault: NamespaceFaultConfig,
/// Fault to apply to all IO queues
pub io_fault: Arc<IoQueueFaultConfig>,
/// Fault to apply to the Hardware Configuration
pub hardware_config_fault: Option<HardwareConfigFaultConfig>,
}

impl FaultConfiguration {
Expand All @@ -404,6 +442,7 @@ impl FaultConfiguration {
pci_fault: Some(PciFaultConfig::new()),
namespace_fault: NamespaceFaultConfig::new(mesh::channel().1),
io_fault: Arc::new(IoQueueFaultConfig::new(fault_active)),
hardware_config_fault: None,
}
}

Expand All @@ -430,6 +469,35 @@ impl FaultConfiguration {
self.namespace_fault = namespace_fault;
self
}

/// Add a hardware config fault configuration to the fault configuration
pub fn with_hardware_config_fault(
mut self,
hardware_config_fault: HardwareConfigFaultConfig,
) -> Self {
self.hardware_config_fault = Some(hardware_config_fault);
self
}
}

impl HardwareConfigFaultConfig {
/// Create a new no-op hardware config fault configuration. All fields
/// default to `None`, meaning the real hardware values are used.
pub fn new() -> Self {
Self::default()
}

/// Override the PCI Vendor ID reported by the device.
pub fn with_vendor_id(mut self, vendor_id: u16) -> Self {
self.vendor_id = Some(vendor_id);
self
}

/// Override the PCI Device ID reported by the device.
pub fn with_device_id(mut self, device_id: u16) -> Self {
self.device_id = Some(device_id);
self
}
}

impl PciFaultConfig {
Expand Down
15 changes: 13 additions & 2 deletions vm/devices/storage/nvme_test/src/pci.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,21 @@ impl NvmeFaultController {
BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
);

// Apply any hardware-config fault overrides for the IDs reported in
// PCI configuration space, falling back to the real values when no
// override is configured.
let hardware_config_fault = fault_configuration.hardware_config_fault.take();
let vendor_id = hardware_config_fault
.and_then(|f| f.vendor_id)
.unwrap_or(VENDOR_ID);
let device_id = hardware_config_fault
.and_then(|f| f.device_id)
.unwrap_or(0x00a9);
Comment on lines +143 to +149

let cfg_space = ConfigSpaceType0Emulator::new(
HardwareIds {
vendor_id: VENDOR_ID,
device_id: 0x00a9,
vendor_id,
device_id,
revision_id: 0,
prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
Expand Down
Loading
Loading