-
Notifications
You must be signed in to change notification settings - Fork 191
nvme_driver: only support keepalive for a certain class of devices #3415
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
0a9f871
aec5033
1c8bc55
30b5bad
6076ef6
1e19679
fc20c57
4c15d5d
4c9456c
dbab214
6412280
dea6e9e
7d4c3e2
b60a859
0af4f8e
0a96c9d
4efda83
88aed69
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,7 @@ use crate::nvme_manager::CreateNvmeDriver; | |
| use crate::nvme_manager::device::NvmeDriverManager; | ||
| use crate::nvme_manager::device::NvmeDriverManagerClient; | ||
| use crate::nvme_manager::device::NvmeDriverShutdownOptions; | ||
| use crate::nvme_manager::is_nvme_keepalive_compatible; | ||
| use crate::nvme_manager::save_restore::NvmeManagerSavedState; | ||
| use crate::nvme_manager::save_restore::NvmeSavedDiskConfig; | ||
| use crate::servicing::NvmeSavedState; | ||
|
|
@@ -301,12 +302,17 @@ impl NvmeManagerWorker { | |
|
|
||
| async { | ||
| join_all(devices_to_shutdown.into_iter().map(|(pci_id, driver)| { | ||
| let keepalive_compatible = driver.keepalive_compatible(); | ||
| driver | ||
| .shutdown(NvmeDriverShutdownOptions { | ||
| // nvme_keepalive is received from host but it is only valid | ||
| // when memory pool allocator supports save/restore. | ||
| do_not_reset: nvme_keepalive && self.context.save_restore_supported, | ||
| skip_device_shutdown: nvme_keepalive && self.context.save_restore_supported, | ||
| do_not_reset: nvme_keepalive | ||
| && self.context.save_restore_supported | ||
| && keepalive_compatible, | ||
| skip_device_shutdown: nvme_keepalive | ||
| && self.context.save_restore_supported | ||
| && keepalive_compatible, | ||
| }) | ||
| .instrument(tracing::info_span!("shutdown_nvme_driver", %pci_id)) | ||
| })) | ||
|
|
@@ -355,11 +361,14 @@ impl NvmeManagerWorker { | |
| match guard.entry(pci_id.clone()) { | ||
| hash_map::Entry::Occupied(_) => unreachable!(), // We checked above that this entry does not exist. | ||
| hash_map::Entry::Vacant(entry) => { | ||
| let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id); | ||
|
|
||
|
Comment on lines
361
to
+365
|
||
| let driver = NvmeDriverManager::new( | ||
| &context.driver_source, | ||
| &pci_id, | ||
| context.vp_count, | ||
| context.save_restore_supported, | ||
| context.save_restore_supported && keepalive_compatible, | ||
|
Comment on lines
+364
to
+370
|
||
| keepalive_compatible, | ||
| None, // No device yet, | ||
| context.nvme_driver_spawner.clone(), | ||
| )?; | ||
|
|
@@ -419,12 +428,24 @@ impl NvmeManagerWorker { | |
| /// Saves NVMe device's states into buffer during servicing. | ||
| pub async fn save(&mut self) -> anyhow::Result<NvmeManagerSavedState> { | ||
| let mut nvme_disks: Vec<NvmeSavedDiskConfig> = Vec::new(); | ||
| // Only save state for devices known to be keepalive-compatible. | ||
| let mut devices_to_save: HashMap<String, NvmeDriverManagerClient> = self | ||
| .context | ||
| .devices | ||
| .write() | ||
| .iter() | ||
|
Comment on lines
432
to
436
|
||
| .map(|(pci_id, driver)| (pci_id.clone(), driver.client().clone())) | ||
| .filter_map(|(pci_id, driver)| { | ||
| if driver.keepalive_compatible() { | ||
| Some((pci_id.clone(), driver.client().clone())) | ||
| } else { | ||
| tracing::info!( | ||
| %pci_id, | ||
| "skipping save of nvme device; \ | ||
| keepalive disabled for this device" | ||
| ); | ||
| None | ||
| } | ||
| }) | ||
|
gurasinghMS marked this conversation as resolved.
Comment on lines
+437
to
+448
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This makes sense. I don't know how long the logging prolongs the time inside the lock (which lock? I don't see locks in this function), so don't know if we should fix this.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know what lock it is talking about either. But unless there are hundreds of devices, which there shouldn't be, I think this is fine. @alandau you ok moving forward with this? |
||
| .collect(); | ||
|
gurasinghMS marked this conversation as resolved.
|
||
| for (pci_id, client) in devices_to_save.iter_mut() { | ||
| nvme_disks.push(NvmeSavedDiskConfig { | ||
|
|
@@ -449,14 +470,19 @@ impl NvmeManagerWorker { | |
|
|
||
| for disk in &saved_state.nvme_disks { | ||
| let pci_id = disk.pci_id.clone(); | ||
|
|
||
| // Read the PCI vendor/device IDs once and cache the resulting | ||
| // keepalive compatibility flag on the `NvmeDriverManager`. | ||
| let keepalive_compatible = is_nvme_keepalive_compatible(&pci_id); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alandau do you know if the device would be resolved at this point? Like if the device has not arrived back and reading the config space leads to a stall, it could impact boot times (which is no better than not doing keepalive at all)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't. But I remember at some point we ask VfioDevice to restore and it has to go to /sys to do its magic. You can try seeing if it's doing it past this point or not. |
||
|
|
||
| let nvme_driver = self | ||
| .context | ||
| .nvme_driver_spawner | ||
| .create_driver( | ||
| &self.context.driver_source, | ||
| &pci_id, | ||
| saved_state.cpu_count, | ||
| save_restore_supported, | ||
| save_restore_supported && keepalive_compatible, | ||
| Some(&disk.driver_state), | ||
| ) | ||
| .await?; | ||
|
|
@@ -468,6 +494,7 @@ impl NvmeManagerWorker { | |
| &pci_id, | ||
| self.context.vp_count, | ||
| true, // save_restore_supported is always `true` when restoring. | ||
| keepalive_compatible, | ||
| Some(nvme_driver), | ||
| self.context.nvme_driver_spawner.clone(), | ||
| )?, | ||
|
|
@@ -1068,9 +1095,16 @@ mod tests { | |
| )); | ||
|
|
||
| // Create a driver manager | ||
| let driver_manager = | ||
| NvmeDriverManager::new(&driver_source, "0000:00:04.0", 4, false, None, spawner) | ||
| .unwrap(); | ||
| let driver_manager = NvmeDriverManager::new( | ||
| &driver_source, | ||
| "0000:00:04.0", | ||
| 4, | ||
| false, | ||
| false, | ||
| None, | ||
| spawner, | ||
| ) | ||
| .unwrap(); | ||
|
|
||
| let client = driver_manager.client().clone(); | ||
|
|
||
|
|
@@ -1154,6 +1188,7 @@ mod tests { | |
| "0000:00:05.0", | ||
| 4, | ||
| true, // save_restore_supported | ||
| true, | ||
| None, | ||
| spawner, | ||
| ) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.