From 55855322d5ec4eb467099a619726d2e1a4de362e Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Wed, 18 Feb 2026 15:03:40 +0000 Subject: [PATCH 01/53] fix: open files with `read` modifier The commit 7ee43cc6cbe915e1a5292bd9d5f6596ed4f6a9a9 ("fix: Support creating file with open_file_nonblock") did modify the file opening utility function by adding `open` option, but it also removed the `read` option from it. This causes an error during metrics and logs file initialization code if the file is a FIFO and there are no readers already reading from it. This is because `open` returns `ENXIO` when opening a FIFO write-only as described in the man page: ``` ENXIO O_NONBLOCK | O_WRONLY is set, the named file is a FIFO, and no process has the FIFO open for reading. ``` Fix is just a partial revert of the part that changed the file opening logic by re-introducing same `open_file_nonblock` as it was before but with added `create` flag. Signed-off-by: Egor Lazarchuk --- src/vmm/src/device_manager/mod.rs | 4 ++-- src/vmm/src/logger/logging.rs | 4 ++-- src/vmm/src/utils/mod.rs | 12 +++++++----- src/vmm/src/vmm_config/metrics.rs | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index fc245e05539..2a556b342b1 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -35,7 +35,7 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::resources::VmResources; use crate::snapshot::Persist; -use crate::utils::open_file_write_nonblock; +use crate::utils::open_file_nonblock; use crate::vstate::bus::BusError; use crate::vstate::memory::GuestMemoryMmap; use crate::{EmulateSerialInitError, EventManager, Vm}; @@ -125,7 +125,7 @@ impl DeviceManager { output: Option<&PathBuf>, ) -> Result>, std::io::Error> { let (serial_in, serial_out) = match output { - Some(path) => (None, 
open_file_write_nonblock(path).map(SerialOut::File)?), + Some(path) => (None, open_file_nonblock(path).map(SerialOut::File)?), None => { Self::set_stdout_nonblocking(); diff --git a/src/vmm/src/logger/logging.rs b/src/vmm/src/logger/logging.rs index 8afdf976ffb..a108fb3474b 100644 --- a/src/vmm/src/logger/logging.rs +++ b/src/vmm/src/logger/logging.rs @@ -13,7 +13,7 @@ use serde::{Deserialize, Deserializer, Serialize}; use utils::time::LocalTime; use super::metrics::{IncMetric, METRICS}; -use crate::utils::open_file_write_nonblock; +use crate::utils::open_file_nonblock; /// Default level filter for logger matching the swagger specification /// (`src/firecracker/swagger/firecracker.yaml`). @@ -62,7 +62,7 @@ impl Logger { ); if let Some(log_path) = config.log_path { - let file = open_file_write_nonblock(&log_path).map_err(LoggerUpdateError)?; + let file = open_file_nonblock(&log_path).map_err(LoggerUpdateError)?; guard.target = Some(file); }; diff --git a/src/vmm/src/utils/mod.rs b/src/vmm/src/utils/mod.rs index 1288abef0ba..97762ae97d7 100644 --- a/src/vmm/src/utils/mod.rs +++ b/src/vmm/src/utils/mod.rs @@ -76,14 +76,16 @@ pub const fn align_down(addr: u64, align: u64) -> u64 { addr & !(align - 1) } -/// Create and open a File for writing to it. -/// In case we open a FIFO, in order to not block the instance if nobody is consuming the message -/// that is flushed to it, we are opening it with `O_NONBLOCK` flag. -/// In this case, writing to a pipe will start failing when reaching 64K of unconsumed content. -pub fn open_file_write_nonblock(path: &Path) -> Result { +/// Create and open a file for both reading and writing to it with a O_NONBLOCK flag. +/// In case we open a FIFO, we need all READ, WRITE and O_NONBLOCK in order to not block the process +/// if nobody is consuming the message. Otherwise opening the FIFO with only WRITE and O_NONBLOCK +/// will fail with ENXIO if there is no reader already attached to it. 
+/// NOTE: writing to a pipe will start failing when reaching 64K of unconsumed content. +pub fn open_file_nonblock(path: &Path) -> Result { OpenOptions::new() .custom_flags(O_NONBLOCK) .create(true) + .read(true) .write(true) .open(path) } diff --git a/src/vmm/src/vmm_config/metrics.rs b/src/vmm/src/vmm_config/metrics.rs index 9d44c35f6a3..38001661fc6 100644 --- a/src/vmm/src/vmm_config/metrics.rs +++ b/src/vmm/src/vmm_config/metrics.rs @@ -7,7 +7,7 @@ use std::path::PathBuf; use serde::{Deserialize, Serialize}; use crate::logger::{FcLineWriter, METRICS}; -use crate::utils::open_file_write_nonblock; +use crate::utils::open_file_nonblock; /// Strongly typed structure used to describe the metrics system. #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] @@ -26,7 +26,7 @@ pub enum MetricsConfigError { /// Configures the metrics as described in `metrics_cfg`. pub fn init_metrics(metrics_cfg: MetricsConfig) -> Result<(), MetricsConfigError> { let writer = FcLineWriter::new( - open_file_write_nonblock(&metrics_cfg.metrics_path) + open_file_nonblock(&metrics_cfg.metrics_path) .map_err(|err| MetricsConfigError::InitializationFailure(err.to_string()))?, ); METRICS From 51c86926005658dbdfc8b47573bdf9409f26ded2 Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Wed, 18 Feb 2026 15:07:32 +0000 Subject: [PATCH 02/53] cleanup: remove redundant std::result::Result imports No functional change. Just cleanup. 
Signed-off-by: Egor Lazarchuk --- src/vmm/src/utils/mod.rs | 1 - src/vmm/src/vstate/interrupts.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/vmm/src/utils/mod.rs b/src/vmm/src/utils/mod.rs index 97762ae97d7..4cc7640fd74 100644 --- a/src/vmm/src/utils/mod.rs +++ b/src/vmm/src/utils/mod.rs @@ -14,7 +14,6 @@ use std::fs::{File, OpenOptions}; use std::num::Wrapping; use std::os::unix::fs::OpenOptionsExt; use std::path::Path; -use std::result::Result; use libc::O_NONBLOCK; diff --git a/src/vmm/src/vstate/interrupts.rs b/src/vmm/src/vstate/interrupts.rs index 5246144d8f6..852086b3e6d 100644 --- a/src/vmm/src/vstate/interrupts.rs +++ b/src/vmm/src/vstate/interrupts.rs @@ -187,7 +187,7 @@ impl<'a> Persist<'a> for MsixVectorGroup { fn restore( constructor_args: Self::ConstructorArgs, state: &Self::State, - ) -> std::result::Result { + ) -> Result { let mut vectors = Vec::with_capacity(state.len()); for gsi in state { From 3a12ebff885cd8b644299d6a546bbd1ea91a52ca Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Thu, 19 Feb 2026 11:19:04 +0000 Subject: [PATCH 03/53] changelog: add note about FIFO fix Add note about fix in the #5698 PR. Signed-off-by: Egor Lazarchuk --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 080adf57257..ad2aa0b5560 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.14.2] + +### Fixed + +- [#5698](https://github.com/firecracker-microvm/firecracker/pull/5698): Fixed + the possible ENXIO error which could occur during file open operation if the + underlying file is FIFO without active readers already attached. 
+ ## [1.14.1] ### Changed From 07fd78472faddaf8b7a43169788dc6f537b8ce7c Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Tue, 24 Feb 2026 10:23:27 +0000 Subject: [PATCH 04/53] vmm: memory.rs: Do not panic in dump_dirty() In GuestMemoryExtension::dump_dirty() and GuestMemorySlot::dump_dirty() there are several unwraps which will cause Firecracker to panic. Instead of panicking simply propagate errors up the call chain to mark the snapshot operation as failed. To make error handling less verbose and more descriptive make GuestMemorySlot::dump_dirty() return MemoryError instead of GuestMemoryError and define new error types for MemoryError. Signed-off-by: Ilias Stamatis --- src/vmm/src/vstate/memory.rs | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index ef160d9e918..35eea98abe9 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -59,6 +59,20 @@ pub enum MemoryError { Unaligned, /// Error protecting memory slot: {0} Mprotect(std::io::Error), + /// Size too large for i64 conversion + SlotSizeTooLarge, + /// Dirty bitmap not found for memory slot {0} + DirtyBitmapNotFound(u32), + /// Seek error: {0} + SeekError(std::io::Error), + /// Volatile memory error: {0} + VolatileMemoryError(vm_memory::VolatileMemoryError), +} + +impl From<vm_memory::VolatileMemoryError> for MemoryError { + fn from(e: vm_memory::VolatileMemoryError) -> Self { + MemoryError::VolatileMemoryError(e) + } } /// Type of the guest region @@ -121,7 +135,7 @@ impl<'a> GuestMemorySlot<'a> { writer: &mut T, kvm_bitmap: &[u64], page_size: usize, - ) -> Result<(), GuestMemoryError> { + ) -> Result<(), MemoryError> { let firecracker_bitmap = self.slice.bitmap(); let mut write_size = 0; let mut skip_size = 0; @@ -137,9 +151,12 @@ impl<'a> GuestMemorySlot<'a> { // We are at the start of a new batch of dirty pages. if skip_size > 0 { // Seek forward over the unmodified pages. 
+ let offset = skip_size + .try_into() + .map_err(|_| MemoryError::SlotSizeTooLarge)?; writer - .seek(SeekFrom::Current(skip_size.try_into().unwrap())) - .unwrap(); + .seek(SeekFrom::Current(offset)) + .map_err(MemoryError::SeekError)?; dirty_batch_start = page_offset; skip_size = 0; } @@ -668,10 +685,15 @@ impl GuestMemoryExtension for GuestMemoryMmap { .flat_map(|region| region.slots()) .try_for_each(|(mem_slot, plugged)| { if !plugged { - let ilen = i64::try_from(mem_slot.slice.len()).unwrap(); - writer.seek(SeekFrom::Current(ilen)).unwrap(); + let ilen = i64::try_from(mem_slot.slice.len()) + .map_err(|_| MemoryError::SlotSizeTooLarge)?; + writer + .seek(SeekFrom::Current(ilen)) + .map_err(MemoryError::SeekError)?; } else { - let kvm_bitmap = dirty_bitmap.get(&mem_slot.slot).unwrap(); + let kvm_bitmap = dirty_bitmap + .get(&mem_slot.slot) + .ok_or(MemoryError::DirtyBitmapNotFound(mem_slot.slot))?; mem_slot.dump_dirty(writer, kvm_bitmap, page_size)?; } Ok(()) @@ -683,7 +705,7 @@ impl GuestMemoryExtension for GuestMemoryMmap { self.reset_dirty(); } - write_result.map_err(MemoryError::WriteMemory) + write_result } /// Resets all the memory region bitmaps From 881e4b8b073c1e3438ce0a91eb3670379c524e0f Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Thu, 19 Feb 2026 18:46:44 +0000 Subject: [PATCH 05/53] tests: memory.rs: Improve documentation in test_dump_dirty() The comment in test_dump_dirty() before the last snapshot is taken says: // First region pages: [dirty, clean] However, the following code line updates the second page rather than the first one, so in reality the state of the bitmap is [clean, dirty]. Fix the comment. Additionally, the "Dump only the dirty pages." comment further above is misleading. Since we write to all pages of all regions, the Firecracker bitmap (rather than the KVM bitmap) will be all 1s, therefore the first snapshot taken by this function will dump all memory pages. 
Rename 'dirty_bitmap' to 'kvm_dirty_bitmap' to make it clear which bitmap it refers to and add comments about the state of the Firecracker bitmap too. To make things more obvious for the reader re-configure the state of the KVM dirty bitmap next to the relevant comment, just before we take the second snapshot. Since we are in the neighbourhood, rename 'expected_first_region' to 'expected_file_contents' since it clearly stores the contents of both regions rather than the first one only. No functional change intended. Signed-off-by: Ilias Stamatis --- src/vmm/src/vstate/memory.rs | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 35eea98abe9..eb94a40b74c 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -1145,17 +1145,23 @@ mod tests { .write(&second_region, region_2_address) .unwrap(); + // Firecracker Dirty Bitmap after the writes: + // First region pages: [dirty, dirty] + // Second region pages: [dirty, dirty] + let memory_state = guest_memory.describe(); - // Dump only the dirty pages. + // KVM dirty bitmap: // First region pages: [dirty, clean] // Second region pages: [clean, dirty] - let mut dirty_bitmap: DirtyBitmap = HashMap::new(); - dirty_bitmap.insert(0, vec![0b01]); - dirty_bitmap.insert(1, vec![0b10]); + let mut kvm_dirty_bitmap: DirtyBitmap = HashMap::new(); + kvm_dirty_bitmap.insert(0, vec![0b01]); + kvm_dirty_bitmap.insert(1, vec![0b10]); let mut file = TempFile::new().unwrap().into_file(); - guest_memory.dump_dirty(&mut file, &dirty_bitmap).unwrap(); + guest_memory + .dump_dirty(&mut file, &kvm_dirty_bitmap) + .unwrap(); // We can restore from this because this is the first dirty dump. 
let restored_guest_memory = @@ -1180,18 +1186,25 @@ mod tests { let ones = vec![1u8; page_size]; let twos = vec![2u8; page_size]; - // Firecracker Bitmap - // First region pages: [dirty, clean] + // Firecracker Dirty Bitmap: + // First region pages: [clean, dirty] // Second region pages: [clean, clean] guest_memory .write(&twos, GuestAddress(page_size as u64)) .unwrap(); + // KVM dirty bitmap: + // First region pages: [dirty, clean] + // Second region pages: [clean, dirty] + kvm_dirty_bitmap.insert(0, vec![0b01]); + kvm_dirty_bitmap.insert(1, vec![0b10]); - guest_memory.dump_dirty(&mut reader, &dirty_bitmap).unwrap(); + guest_memory + .dump_dirty(&mut reader, &kvm_dirty_bitmap) + .unwrap(); // Check that only the dirty regions are dumped. let mut diff_file_content = Vec::new(); - let expected_first_region = [ + let expected_file_contents = [ ones.as_slice(), twos.as_slice(), zeros.as_slice(), @@ -1200,7 +1213,7 @@ mod tests { .concat(); reader.seek(SeekFrom::Start(0)).unwrap(); reader.read_to_end(&mut diff_file_content).unwrap(); - assert_eq!(expected_first_region, diff_file_content); + assert_eq!(expected_file_contents, diff_file_content); } #[test] From 0b8725e7c0e7296568404ea88fb01bfa73cfa9b3 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Tue, 24 Feb 2026 12:29:34 +0000 Subject: [PATCH 06/53] vmm: memory.rs: Fail dump_dirty() when bitmap size is wrong It is possible that the dirty bitmap dump_dirty() receives is larger or smaller than the slot size. Return an error in that case. Additionally, the inner loop in that function always iterates over the 0..64 range. Typically the region size won't be a multiple of 64, so we need to make sure that we break after we check the last bit that corresponds to the last page of the region. Extend the test_dump_dirty() test case to exercise the new code paths by supplying wrongly sized bitmaps to the function. 
Signed-off-by: Ilias Stamatis --- src/vmm/src/vstate/memory.rs | 43 ++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index eb94a40b74c..6028336fe01 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -63,6 +63,10 @@ pub enum MemoryError { SlotSizeTooLarge, /// Dirty bitmap not found for memory slot {0} DirtyBitmapNotFound(u32), + /// Dirty bitmap is larger than the slot size + DirtyBitmapTooLarge, + /// Dirty bitmap is smaller than the slot size + DirtyBitmapTooSmall, /// Seek error: {0} SeekError(std::io::Error), /// Volatile memory error: {0} @@ -141,12 +145,31 @@ impl<'a> GuestMemorySlot<'a> { let mut skip_size = 0; let mut dirty_batch_start = 0; + let expected_bitmap_array_len = (self.slice.len() / page_size).div_ceil(64); + if kvm_bitmap.len() > expected_bitmap_array_len { + return Err(MemoryError::DirtyBitmapTooLarge); + } else if kvm_bitmap.len() < expected_bitmap_array_len { + return Err(MemoryError::DirtyBitmapTooSmall); + } + for (i, v) in kvm_bitmap.iter().enumerate() { for j in 0..64 { let is_kvm_page_dirty = ((v >> j) & 1u64) != 0u64; let page_offset = ((i * 64) + j) * page_size; let is_firecracker_page_dirty = firecracker_bitmap.dirty_at(page_offset); + // We process 64 pages at a time, however the number of pages + // in the slot might not be a multiple of 64. We need to break + // once we go past the last page that is actually part of the + // region. + if page_offset >= self.slice.len() { + // Ensure there are no more dirty bits after this point + if (v >> j) != 0 { + return Err(MemoryError::DirtyBitmapTooLarge); + } + break; + } + if is_kvm_page_dirty || is_firecracker_page_dirty { // We are at the start of a new batch of dirty pages. 
if skip_size > 0 { @@ -1214,6 +1237,26 @@ mod tests { reader.seek(SeekFrom::Start(0)).unwrap(); reader.read_to_end(&mut diff_file_content).unwrap(); assert_eq!(expected_file_contents, diff_file_content); + + // Test with bitmaps that are too large or too small + kvm_dirty_bitmap.insert(0, vec![0b1, 0b01]); + kvm_dirty_bitmap.insert(1, vec![0b10]); + assert!(matches!( + guest_memory.dump_dirty(&mut reader, &kvm_dirty_bitmap), + Err(MemoryError::DirtyBitmapTooLarge) + )); + kvm_dirty_bitmap.insert(0, vec![0b01]); + kvm_dirty_bitmap.insert(1, vec![0b110]); + assert!(matches!( + guest_memory.dump_dirty(&mut reader, &kvm_dirty_bitmap), + Err(MemoryError::DirtyBitmapTooLarge) + )); + kvm_dirty_bitmap.insert(0, vec![]); + kvm_dirty_bitmap.insert(1, vec![0b10]); + assert!(matches!( + guest_memory.dump_dirty(&mut reader, &kvm_dirty_bitmap), + Err(MemoryError::DirtyBitmapTooSmall) + )); } #[test] From 5156f7d3b0a9d76734b6db0ab2e6937076eaf4a4 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Thu, 19 Feb 2026 18:13:37 +0000 Subject: [PATCH 07/53] fix(diff-snapshot): Advance file cursor when trailing pages are clean GuestMemorySlot::dump_dirty() does not advance the file cursor when the pages at the end of the memory region are clean. This causes the next slot to start writing at an incorrect offset, corrupting the contents of the previous region. Fix this by always advancing the cursor at the end of dump_dirty(). As per the original report from EJ Ciramella this only affects VMs with more than one memory slot and it can cause Firecracker or the guest to crash when loading a corrupted snapshot. 
Fixes: 6c4c1bf5b857f9 ("feat(mem): introduce KVM slots per GuestMemoryRegion") Reported-by: EJ Ciramella Suggested-by: EJ Ciramella Signed-off-by: Ilias Stamatis --- src/vmm/src/vstate/memory.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 6028336fe01..7cf3aee019e 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -201,6 +201,14 @@ impl<'a> GuestMemorySlot<'a> { writer.write_all_volatile(&self.slice.subslice(dirty_batch_start, write_size)?)?; } + // Advance the cursor even if the trailing pages are clean, so that the + // next slot starts writing at the correct offset. + if skip_size > 0 { + writer + .seek(SeekFrom::Current(skip_size.try_into().unwrap())) + .map_err(MemoryError::SeekError)?; + } + Ok(()) } From beba51f71f5c5d4c605285df3383027c3c16b2b7 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Fri, 20 Feb 2026 10:44:48 +0000 Subject: [PATCH 08/53] tests: memory.rs: Extend test_dump_dirty() with trailing clean case The test_dump_dirty() test currently checks the file contents of a differential snapshot where the last page of both memory regions is dirty. Extend this test to also test the case where the last page of both regions is clean. Additionally, check that the logical size of the resulting file is different than the physical size due to the holes representing clean pages. Finally, make sure that if the KVM dirty bitmap is larger than the region size, the extra bits are ignored. 
Signed-off-by: Ilias Stamatis --- src/vmm/src/vstate/memory.rs | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 7cf3aee019e..9b62152c4b8 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -867,6 +867,7 @@ mod tests { use std::collections::HashMap; use std::io::{Read, Seek, Write}; + use std::os::unix::fs::MetadataExt; use vmm_sys_util::tempfile::TempFile; @@ -1246,6 +1247,50 @@ mod tests { reader.read_to_end(&mut diff_file_content).unwrap(); assert_eq!(expected_file_contents, diff_file_content); + // Take a 3rd snapshot + + // Firecracker Dirty Bitmap: + // First region pages: [dirty, clean] + // Second region pages: [dirty, clean] + guest_memory.write(&twos, region_1_address).unwrap(); + guest_memory.write(&ones, region_2_address).unwrap(); + // KVM dirty bitmap: + // First region pages: [clean, clean] + // Second region pages: [clean, clean] + kvm_dirty_bitmap.insert(0, vec![0b00]); + kvm_dirty_bitmap.insert(1, vec![0b00]); + + let file = TempFile::new().unwrap(); + let logical_size = page_size as u64 * 4; + file.as_file().set_len(logical_size).unwrap(); + + let mut reader = file.into_file(); + guest_memory + .dump_dirty(&mut reader, &kvm_dirty_bitmap) + .unwrap(); + + // Check that only the dirty regions are dumped. + let mut diff_file_content = Vec::new(); + // The resulting file is a sparse file with holes. + let expected_file_contents = [ + twos.as_slice(), + zeros.as_slice(), // hole + ones.as_slice(), + zeros.as_slice(), // hole + ] + .concat(); + reader.seek(SeekFrom::Start(0)).unwrap(); + reader.read_to_end(&mut diff_file_content).unwrap(); + + assert_eq!(expected_file_contents, diff_file_content); + + // Make sure that only 2 of the pages are written in the file and the + // other two are holes. 
+ let metadata = reader.metadata().unwrap(); + let physical_size = metadata.blocks() * 512; + assert_eq!(physical_size, 2 * page_size as u64); + assert_ne!(physical_size, logical_size); + // Test with bitmaps that are too large or too small kvm_dirty_bitmap.insert(0, vec![0b1, 0b01]); kvm_dirty_bitmap.insert(1, vec![0b10]); From 70ecd2afdc097d2b689e07fe7b703cf9fe530cbd Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Fri, 20 Feb 2026 11:01:22 +0000 Subject: [PATCH 09/53] tests: integration: Make test_diff_snapshot_overlay test multi-slot VMs The test_diff_snapshot_overlay() case tests differential snapshots on VMs that have a single memory slot since basic_config() uses a 256MiB memory size by default. Parametrize the test so that it's repeated for both 256MiB and 4096MiB sizes. On x86 this will create 2 memory slots and hence test a different scenario. Suggested-by: Riccardo Mancini Signed-off-by: Ilias Stamatis --- tests/integration_tests/functional/test_snapshot_basic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index bd9f1ec0d9b..3695941fc5d 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -410,14 +410,15 @@ def test_create_large_diff_snapshot(uvm_plain): # process would have been taken down. -def test_diff_snapshot_overlay(uvm_plain_any, microvm_factory): +@pytest.mark.parametrize("mem_size", [256, 4096]) +def test_diff_snapshot_overlay(uvm_plain_any, microvm_factory, mem_size): """ Tests that if we take a diff snapshot and direct firecracker to write it on top of an existing snapshot file, it will successfully merge them. 
""" basevm = uvm_plain_any basevm.spawn() - basevm.basic_config(track_dirty_pages=True) + basevm.basic_config(track_dirty_pages=True, mem_size_mib=mem_size) basevm.add_net_iface() basevm.start() From cdbcdd680998b71ad352735352936eb1aeab0ce7 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 25 Feb 2026 10:00:54 +0000 Subject: [PATCH 10/53] CHANGELOG: 1.14.2: Mention diff snapshot memory corruption bugfix Update v1.14.2 CHANGELOG mentioning the fix for https://github.com/firecracker-microvm/firecracker/pull/5705 Signed-off-by: Ilias Stamatis --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad2aa0b5560..4f8899c9464 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,10 @@ and this project adheres to - [#5698](https://github.com/firecracker-microvm/firecracker/pull/5698): Fixed the possible ENXIO error which could occur during file open operation if the underlying file is FIFO without active readers already attached. +- [#5705](https://github.com/firecracker-microvm/firecracker/pull/5705): Fixed a + bug that caused Firecracker to corrupt the memory files of differential + snapshots for VMs with multiple memory slots. This affected VMs using memory + hot-plugging or any x86 VMs with a memory size larger than 3GiB. ## [1.14.1] From ba56cd225234d0c09990e794d29921614ba4ae1e Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Tue, 24 Feb 2026 10:16:37 +0000 Subject: [PATCH 11/53] devtool: move ensure_kvm into devtool This one was only used in devtool, so move it there. No functional change. Signed-off-by: Egor Lazarchuk --- tools/devtool | 37 +++++++++++++++++++++++++++++++++++++ tools/functions | 12 ------------ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/tools/devtool b/tools/devtool index 4aaa37950e0..825cbbe34d7 100755 --- a/tools/devtool +++ b/tools/devtool @@ -583,6 +583,43 @@ ensure_ci_artifacts() { fi } +# Attempt to load the appropriate KVM module for the current platform. 
+# Returns 0 on success, non-zero on failure. +# +load_kvm() { + local arch + arch=$(uname -m) + + case "$arch" in + x86_64|i*86) + if grep -q "vmx" /proc/cpuinfo; then + modprobe kvm_intel || return 1 + elif grep -q "svm" /proc/cpuinfo; then + modprobe kvm_amd avic=1 || return 1 + else + return 1 + fi + ;; + aarch64|arm*) + modprobe kvm || return 1 + ;; + *) + return 1 + ;; + esac + + # Check /dev/kvm now exists + [[ -c /dev/kvm ]] +} + +# Check if /dev/kvm exists. Attempt to load the module if it doesn't. +# Exit if KVM is unavailable. Upon returning from this call, the caller +# can be certain /dev/kvm is available. +# +ensure_kvm() { + [[ -c /dev/kvm ]] || load_kvm || die "/dev/kvm not found. Aborting." +} + apply_linux_61_tweaks() { KV=$(uname -r) if [[ $KV != 6.1.* ]] || [ $(uname -m) != x86_64 ]; then diff --git a/tools/functions b/tools/functions index 90e75c251bd..5268aab5ebc 100644 --- a/tools/functions +++ b/tools/functions @@ -125,15 +125,3 @@ function validate_version { die "Invalid version number: $version. Version should not contain \`wip\` or \`dirty\`." fi } - -######################### -# Firecracker functions # -######################### - -# Check if /dev/kvm exists. Exit if it doesn't. -# Upon returning from this call, the caller can be certain /dev/kvm is -# available. -# -ensure_kvm() { - [[ -c /dev/kvm ]] || die "/dev/kvm not found. Aborting." -} From 18d533e49d2569745e1d3c0d19b397ab4e2ef636 Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Wed, 18 Feb 2026 17:40:01 +0000 Subject: [PATCH 12/53] devtool: more robust logic for tweaking kvm module There are several issues with the current implementation which result in spurious failures/timeouts in CI. Additionally, the kvm update logic was duplicated in a couple of places. To resolve these issues, refactor the code a bit to move kvm loading/checking logic into one place and implement a more robust locking mechanism. 
Signed-off-by: Egor Lazarchuk --- tools/devtool | 187 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 113 insertions(+), 74 deletions(-) diff --git a/tools/devtool b/tools/devtool index 825cbbe34d7..ee9299ea772 100755 --- a/tools/devtool +++ b/tools/devtool @@ -133,6 +133,9 @@ TARGET_PREFIX="$(uname -m)-unknown-linux-" # Container path to directory where we store built CI artifacts. CTR_CI_ARTIFACTS_PATH="${CTR_FC_ROOT_DIR}/resources/$(uname -m)" +# Lockfile used while modifying KVM modules +KVM_MODULE_LOCKFILE="/tmp/.kvm_module_lock" + # Check if Docker is available and exit if it's not. # Upon returning from this call, the caller can be certain Docker is available. # @@ -583,89 +586,127 @@ ensure_ci_artifacts() { fi } -# Attempt to load the appropriate KVM module for the current platform. -# Returns 0 on success, non-zero on failure. -# -load_kvm() { - local arch - arch=$(uname -m) - - case "$arch" in - x86_64|i*86) - if grep -q "vmx" /proc/cpuinfo; then - modprobe kvm_intel || return 1 - elif grep -q "svm" /proc/cpuinfo; then - modprobe kvm_amd avic=1 || return 1 - else - return 1 - fi - ;; - aarch64|arm*) - modprobe kvm || return 1 - ;; - *) +# Acquire the KVM module lock and run the given command. +# Uses flock with a timeout for safe, automatic lock management. +# Usage: with_kvm_module_lock [args...] +with_kvm_module_lock() { + local LOCK_TIMEOUT=120 + ( + if ! flock -w "$LOCK_TIMEOUT" 9; then + say_warn "Timed out waiting for KVM module lock after: ${LOCK_TIMEOUT}s" + exit 1 + fi + echo "Successfully acquired lock" + "$@" + ) 9>"$KVM_MODULE_LOCKFILE" +} + +# Reload KVM modules with the given vendor module and kvm params. +# Always enables avic=1 on AMD. Unloads first if already loaded. +# Usage: reload_kvm_modules [kvm_param...] +# e.g. reload_kvm_modules kvm_intel nx_huge_pages=never +reload_kvm_modules() { + local vendor_mod=$1; shift + + # Unload if already loaded + if lsmod | grep -qP "^kvm_(amd|intel)"; then + if ! 
sudo modprobe -r $vendor_mod kvm; then + say_warn "Failed to unload KVM modules (${vendor_mod}, kvm) (may be in use)" return 1 - ;; - esac + fi + fi - # Check /dev/kvm now exists - [[ -c /dev/kvm ]] + if ! sudo modprobe kvm "$@"; then + say_warn "Failed to load kvm module" + return 1 + fi + if [[ $vendor_mod == "kvm_amd" ]]; then + if ! sudo modprobe kvm_amd avic=1; then + say_warn "Failed to load kvm_amd module" + return 1 + fi + else + if ! sudo modprobe $vendor_mod; then + say_warn "Failed to load $vendor_mod module" + return 1 + fi + fi } -# Check if /dev/kvm exists. Attempt to load the module if it doesn't. -# Exit if KVM is unavailable. Upon returning from this call, the caller -# can be certain /dev/kvm is available. -# -ensure_kvm() { - [[ -c /dev/kvm ]] || load_kvm || die "/dev/kvm not found. Aborting." +# Determine the KVM vendor module for the current CPU. +kvm_vendor_mod() { + if grep -q "vmx" /proc/cpuinfo; then + echo kvm_intel + elif grep -q "svm" /proc/cpuinfo; then + echo kvm_amd + else + # aarch64 + echo kvm + fi } -apply_linux_61_tweaks() { - KV=$(uname -r) - if [[ $KV != 6.1.* ]] || [ $(uname -m) != x86_64 ]; then - return +# Ensure /dev/kvm is available and apply platform-specific KVM tweaks. +# - Loads KVM modules if not present +# - On Linux 6.1 x86_64: applies nx_huge_pages=never for non-vulnerable CPUs, +# checks favordynmods for vulnerable ones +# - On AMD: ensures AVIC is enabled +setup_kvm() { + local kernel_version=$(uname -r) + local arch=$(uname -m) + local vendor_mod=$(kvm_vendor_mod) + + local need_kvm_reload=0 + local kvm_extra_params=() + + # Load KVM if not already available + if [[ ! 
-c /dev/kvm ]]; then + need_kvm_reload=1 fi - say "Applying Linux 6.1 boot-time regression mitigations" - - KVM_VENDOR_MOD=$(lsmod |grep -P "^kvm_(amd|intel)" | awk '{print $1}') - ITLB_MULTIHIT=/sys/devices/system/cpu/vulnerabilities/itlb_multihit - NX_HUGEPAGES=/sys/module/kvm/parameters/nx_huge_pages - - # If m6a/m6i - if grep -q "Not affected" $ITLB_MULTIHIT; then - echo -e "CPU not vulnerable to iTLB multihit, using kvm.nx_huge_pages=never mitigation" - # we need a lock so another process is not running the same thing and to - # avoid race conditions. - lockfile="/tmp/.linux61_tweaks.lock" - set -C # noclobber - while true; do - if echo "$$" > "$lockfile"; then - echo "Successfully acquired lock" - if ! grep -q "never" $NX_HUGEPAGES; then - echo "Reloading KVM modules with nx_huge_pages=never" - sudo modprobe -r $KVM_VENDOR_MOD kvm - sudo modprobe kvm nx_huge_pages=never - sudo modprobe $KVM_VENDOR_MOD - fi - rm "$lockfile" - break - else - sleep 5s + + local itlb_multihit=/sys/devices/system/cpu/vulnerabilities/itlb_multihit + local nx_huge_pages=/sys/module/kvm/parameters/nx_huge_pages + # Linux 6.1 x86_64: mitigate boot-time regression + if [[ $kernel_version == 6.1.* ]] && [[ $arch == x86_64 ]]; then + + say "Applying Linux 6.1 boot-time regression mitigations" + if grep -q "Not affected" $itlb_multihit; then + echo "CPU not vulnerable to iTLB multihit, using kvm.nx_huge_pages=never mitigation" + if ! grep -q "never" $nx_huge_pages 2>/dev/null; then + kvm_extra_params+=(nx_huge_pages=never) + need_kvm_reload=1 fi - done - tail -v $ITLB_MULTIHIT $NX_HUGEPAGES - # else (m5d Skylake and CascadeLake) - else - echo "CPU vulnerable to iTLB_multihit, checking if favordynmods is enabled" - mount |grep cgroup |grep -q favordynmods - if [ $? 
-ne 0 ]; then - say_warn "cgroups' favordynmods option not enabled; VM creation performance may be impacted" else - echo "favordynmods is enabled" + echo "CPU vulnerable to iTLB_multihit, checking if favordynmods is enabled" + if mount | grep cgroup | grep -q favordynmods; then + echo "favordynmods is enabled" + else + say_warn "cgroups' favordynmods option not enabled; VM creation performance may be impacted" + fi fi fi -} + # AMD: ensure AVIC is enabled + local avic_param=/sys/module/kvm_amd/parameters/avic + if [[ $vendor_mod == "kvm_amd" ]]; then + if ! grep -q "Y\|1" $avic_param; then + echo "AVIC not enabled, will reload kvm_amd with avic=1" + need_kvm_reload=1 + fi + fi + + if [[ $need_kvm_reload -eq 1 ]]; then + echo "Reloading KVM modules" + reload_kvm_modules "$vendor_mod" "${kvm_extra_params[@]}" + ok_or_die "Could not reload kvm modules" + fi + + tail -v $itlb_multihit $nx_huge_pages + if [[ $vendor_mod == "kvm_amd" ]]; then + tail -v $avic_param + fi + + [[ -c /dev/kvm ]] || die "/dev/kvm not found. Aborting." +} # Modifies the processors CPU governor and P-state configuration (x86_64 only) for consistent performance. This means # - Disable turbo boost (Intel only) by writing 1 to /sys/devices/system/cpu/intel_pstate/no_turbo @@ -759,7 +800,7 @@ cmd_test() { done # Check prerequisites. - [ $do_kvm_check != 0 ] && ensure_kvm + [ $do_kvm_check != 0 ] && with_kvm_module_lock setup_kvm ensure_devctr ensure_build_dir ensure_ci_artifacts @@ -771,8 +812,6 @@ cmd_test() { fi fi - apply_linux_61_tweaks - # If we got to here, we've got all we need to continue. 
say "Kernel version: $(uname -r)" say "$(sed '/^processor.*: 0$/,/^processor.*: 1$/!d; /^processor.*: 1$/d' /proc/cpuinfo)" From 26b4c55f14f02fbb7dd3353769e608fa90d22445 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 26 Feb 2026 13:33:54 +0000 Subject: [PATCH 13/53] chore: release v1.14.2 Update version number / CHANGELOG / CREDITS Signed-off-by: Riccardo Mancini --- CREDITS.md | 1 + Cargo.lock | 12 ++++++------ docs/RELEASE_POLICY.md | 4 ++-- src/cpu-template-helper/Cargo.toml | 2 +- src/firecracker/Cargo.toml | 2 +- src/firecracker/swagger/firecracker.yaml | 2 +- src/jailer/Cargo.toml | 2 +- src/rebase-snap/Cargo.toml | 2 +- src/seccompiler/Cargo.toml | 2 +- src/snapshot-editor/Cargo.toml | 2 +- 10 files changed, 16 insertions(+), 15 deletions(-) diff --git a/CREDITS.md b/CREDITS.md index abc698944d6..f15b0797896 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -130,6 +130,7 @@ Contributors to the Firecracker repository: - huang-jl <1046678590@qq.com> - Iggy Jackson - ihciah +- Ilias Stamatis - Ioana Chirca - Ishwor Gurung - Iulian Barbu diff --git a/Cargo.lock b/Cargo.lock index 708e381624b..0b6a4dd7d56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -391,7 +391,7 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "cpu-template-helper" -version = "1.14.1" +version = "1.14.2" dependencies = [ "clap", "displaydoc", @@ -554,7 +554,7 @@ checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" [[package]] name = "firecracker" -version = "1.14.1" +version = "1.14.2" dependencies = [ "cargo_toml", "displaydoc", @@ -703,7 +703,7 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jailer" -version = "1.14.1" +version = "1.14.2" dependencies = [ "libc", "log-instrument", @@ -1078,7 +1078,7 @@ dependencies = [ [[package]] name = "rebase-snap" -version = "1.14.1" +version = "1.14.2" dependencies = [ "displaydoc", "libc", @@ -1178,7 +1178,7 @@ 
dependencies = [ [[package]] name = "seccompiler" -version = "1.14.1" +version = "1.14.2" dependencies = [ "bincode", "clap", @@ -1275,7 +1275,7 @@ checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "snapshot-editor" -version = "1.14.1" +version = "1.14.2" dependencies = [ "clap", "clap-num", diff --git a/docs/RELEASE_POLICY.md b/docs/RELEASE_POLICY.md index a5997ddc507..dfc0e1abe51 100644 --- a/docs/RELEASE_POLICY.md +++ b/docs/RELEASE_POLICY.md @@ -90,8 +90,8 @@ v3.1 will be patched since were the last two Firecracker releases and less than | Release | Release Date | Latest Patch | Min. end of support | Official end of Support | | ------: | -----------: | -----------: | ------------------: | :------------------------------ | -| v1.14 | 2025-12-17 | v1.14.0 | 2026-06-17 | Supported | -| v1.13 | 2025-08-28 | v1.13.1 | 2026-02-28 | Supported | +| v1.14 | 2025-12-17 | v1.14.2 | 2026-06-17 | Supported | +| v1.13 | 2025-08-28 | v1.13.2 | 2026-02-28 | Supported | | v1.12 | 2025-05-07 | v1.12.1 | 2025-11-07 | 2025-12-17 (v1.14 released) | | v1.11 | 2025-03-18 | v1.11.0 | 2025-09-18 | 2025-09-18 (end of 6mo support) | | v1.10 | 2024-11-07 | v1.10.1 | 2025-05-07 | 2025-05-07 (v1.12 released) | diff --git a/src/cpu-template-helper/Cargo.toml b/src/cpu-template-helper/Cargo.toml index f0f585131fd..13016632041 100644 --- a/src/cpu-template-helper/Cargo.toml +++ b/src/cpu-template-helper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cpu-template-helper" -version = "1.14.1" +version = "1.14.2" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 02d89ec4183..01d5bb63725 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "firecracker" -version = "1.14.1" +version = "1.14.2" authors = ["Amazon Firecracker team "] edition = "2024" build = "build.rs" diff --git 
a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 0523dd9b08e..ca27fb5d362 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -5,7 +5,7 @@ info: The API is accessible through HTTP calls on specific URLs carrying JSON modeled data. The transport medium is a Unix Domain Socket. - version: 1.14.1 + version: 1.14.2 termsOfService: "" contact: email: "firecracker-maintainers@amazon.com" diff --git a/src/jailer/Cargo.toml b/src/jailer/Cargo.toml index 6a72c64a885..2c3b838d7bc 100644 --- a/src/jailer/Cargo.toml +++ b/src/jailer/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "jailer" -version = "1.14.1" +version = "1.14.2" authors = ["Amazon Firecracker team "] edition = "2024" description = "Process for starting Firecracker in production scenarios; applies a cgroup/namespace isolation barrier and then drops privileges." diff --git a/src/rebase-snap/Cargo.toml b/src/rebase-snap/Cargo.toml index 8f6cee0f895..4aaeaec24ff 100644 --- a/src/rebase-snap/Cargo.toml +++ b/src/rebase-snap/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rebase-snap" -version = "1.14.1" +version = "1.14.2" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" diff --git a/src/seccompiler/Cargo.toml b/src/seccompiler/Cargo.toml index bdd3832a8ea..a8d5e65ad95 100644 --- a/src/seccompiler/Cargo.toml +++ b/src/seccompiler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "seccompiler" -version = "1.14.1" +version = "1.14.2" authors = ["Amazon Firecracker team "] edition = "2024" description = "Program that compiles multi-threaded seccomp-bpf filters expressed as JSON into raw BPF programs, serializing them and outputting them to a file." 
diff --git a/src/snapshot-editor/Cargo.toml b/src/snapshot-editor/Cargo.toml index bd1f93926e3..70045df8886 100644 --- a/src/snapshot-editor/Cargo.toml +++ b/src/snapshot-editor/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snapshot-editor" -version = "1.14.1" +version = "1.14.2" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" From b115e08156b4d283642f59100218bc88a11fcd96 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 5 Mar 2026 14:38:40 +0000 Subject: [PATCH 14/53] chore(deps): bump aws-lc-rs Bump aws-lc-rs to 1.16.1 to take the newest aws-lc-sys 0.38.0 which is required for security advisory. (cherry picked from commit f056e65a221e8f1127e824522ff434c4bed201d3) Signed-off-by: Jack Thomson --- Cargo.lock | 70 ++++++---------------------------------------- src/vmm/Cargo.toml | 2 +- 2 files changed, 10 insertions(+), 62 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0b6a4dd7d56..fee78d03d8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,27 +89,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "aws-lc-fips-sys" -version = "0.13.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57900537c00a0565a35b63c4c281b372edfc9744b072fd4a3b414350a8f5ed48" -dependencies = [ - "bindgen 0.72.1", - "cc", - "cmake", - "dunce", - "fs_extra", - "regex", -] - [[package]] name = "aws-lc-rs" -version = "1.15.1" +version = "1.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b5ce75405893cd713f9ab8e297d8e438f624dde7d706108285f7e17a25a180f" +checksum = "94bffc006df10ac2a68c83692d734a465f8ee6c5b384d8545a636f81d858f4bf" dependencies = [ - "aws-lc-fips-sys", "aws-lc-sys", "untrusted", "zeroize", @@ -117,11 +102,10 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.34.0" +version = "0.38.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "179c3777a8b5e70e90ea426114ffc565b2c1a9f82f6c4a0c5a34aa6ef5e781b6" +checksum = "4321e568ed89bb5a7d291a7f37997c2c0df89809d7b6d12062c81ddb54aa782e" dependencies = [ - "bindgen 0.72.1", "cc", "cmake", "dunce", @@ -169,27 +153,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash 1.1.0", - "shlex", - "syn", -] - -[[package]] -name = "bindgen" -version = "0.72.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" -dependencies = [ - "bitflags 2.10.0", - "cexpr", - "clang-sys", - "itertools 0.12.1", - "log", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash 2.1.1", + "rustc-hash", "shlex", "syn", ] @@ -533,7 +497,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -983,16 +947,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "prettyplease" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" -dependencies = [ - "proc-macro2", - "syn", -] - [[package]] name = "proc-macro2" version = "1.0.103" @@ -1123,12 +1077,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" -[[package]] -name = "rustc-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - [[package]] name = "rustix" version = "0.38.44" @@ -1152,7 +1100,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.9.4", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -1460,7 +1408,7 @@ 
version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc91d95a797a81604af22946d0e86656f27feb0b9665c60665cf3554df12d1a8" dependencies = [ - "bindgen 0.69.5", + "bindgen", "cc", "cfg-if", ] @@ -1693,7 +1641,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index b6ab412a862..84a7083323f 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -17,7 +17,7 @@ gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] acpi_tables = { path = "../acpi-tables" } arrayvec = { version = "0.7.6", optional = true } -aws-lc-rs = { version = "1.15.1", features = ["bindgen"] } +aws-lc-rs = "1.16.1" base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.10.0" From d57d16929dd62afcbb7669e899a5668adc89abb0 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Fri, 6 Mar 2026 10:16:57 +0000 Subject: [PATCH 15/53] fix(mmds): validate tcp opt len Previously the value of the opt len was not validated which contradicts the spec. Per RFC 9293 (MUST-7), opt len includes the kind and length bytes so the minimum valid value length is 2. Reported-by: Kai Mitsuzawa (@kaizawa97) (cherry picked from commit fa3e4d7d9acf505d5cf06c645c88952ad7910406) Signed-off-by: Jack Thomson --- src/vmm/src/dumbo/pdu/tcp.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/dumbo/pdu/tcp.rs b/src/vmm/src/dumbo/pdu/tcp.rs index 4ff1da93dd7..dc160581252 100644 --- a/src/vmm/src/dumbo/pdu/tcp.rs +++ b/src/vmm/src/dumbo/pdu/tcp.rs @@ -257,7 +257,13 @@ impl TcpSegment<'_, T> { } _ => { // Some other option; just skip opt_len bytes in total. - i += b[i + 1] as usize; + // Per RFC 9293 (MUST-7), opt_len includes the kind and + // length bytes so the minimum valid value is 2. 
+ let opt_len = b[i + 1] as usize; + if opt_len < 2 { + return Err(TcpError::MssOption); + } + i += opt_len; continue; } } From 8e72c3d3d7466c26feede31722597330c3b6e77c Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Fri, 6 Mar 2026 11:17:49 +0000 Subject: [PATCH 16/53] test(mmds): assert opt length validation Ensure we reject invalid option lengths (cherry picked from commit e0eb5e7a323f251a93fe804093df7d1ac34f0e6a) Signed-off-by: Jack Thomson --- src/vmm/src/dumbo/pdu/tcp.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/vmm/src/dumbo/pdu/tcp.rs b/src/vmm/src/dumbo/pdu/tcp.rs index dc160581252..38d710229d8 100644 --- a/src/vmm/src/dumbo/pdu/tcp.rs +++ b/src/vmm/src/dumbo/pdu/tcp.rs @@ -818,4 +818,25 @@ mod tests { TcpError::MssRemaining ); } + + #[test] + fn test_invalid_tcp_option_len() { + // Build a minimal segment with header_len = 24 (OPTIONS_OFFSET + 4 bytes of options). + let mut buf = [0u8; 100]; + let header_len: u8 = OPTIONS_OFFSET + 4; + { + let mut seg = TcpSegment::from_bytes_unchecked(buf.as_mut()); + seg.set_header_len_rsvd_ns(header_len, false); + } + // Write an unknown option kind (0xFF) with opt_len = 0 (invalid, < 2). 
+ let opts_start = usize::from(OPTIONS_OFFSET); + buf[opts_start] = 0xFF; + buf[opts_start + 1] = 0; + + let seg = TcpSegment::from_bytes_unchecked(buf.as_ref()); + assert_eq!( + seg.parse_mss_option_unchecked(header_len.into()), + Err(TcpError::MssOption) + ); + } } From 2471ae0a8fb0a07c3944dc29688abd540897511f Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Tue, 10 Mar 2026 14:56:50 +0000 Subject: [PATCH 17/53] chore(changelog): add MMDS tcp option length entry Add new entry for the MMDS TCP option length fix Signed-off-by: Jack Thomson --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f8899c9464..bcc1740f525 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.14.3] + +### Fixed + +- [#5739](https://github.com/firecracker-microvm/firecracker/pull/5739): Fixed + validation of TCP SYN options length when MMDS is enabled. 
+ ## [1.14.2] ### Fixed From 9154cfe5ca760f21eaab7532792a1dc710627048 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Fri, 13 Mar 2026 14:12:53 +0000 Subject: [PATCH 18/53] chore: release v1.14.3 Update version number / CHANGELOG / CREDITS Signed-off-by: Jack Thomson --- Cargo.lock | 12 ++++++------ src/cpu-template-helper/Cargo.toml | 2 +- src/firecracker/Cargo.toml | 2 +- src/firecracker/swagger/firecracker.yaml | 2 +- src/jailer/Cargo.toml | 2 +- src/rebase-snap/Cargo.toml | 2 +- src/seccompiler/Cargo.toml | 2 +- src/snapshot-editor/Cargo.toml | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fee78d03d8c..60b966513b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -355,7 +355,7 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "cpu-template-helper" -version = "1.14.2" +version = "1.14.3" dependencies = [ "clap", "displaydoc", @@ -518,7 +518,7 @@ checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" [[package]] name = "firecracker" -version = "1.14.2" +version = "1.14.3" dependencies = [ "cargo_toml", "displaydoc", @@ -667,7 +667,7 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jailer" -version = "1.14.2" +version = "1.14.3" dependencies = [ "libc", "log-instrument", @@ -1032,7 +1032,7 @@ dependencies = [ [[package]] name = "rebase-snap" -version = "1.14.2" +version = "1.14.3" dependencies = [ "displaydoc", "libc", @@ -1126,7 +1126,7 @@ dependencies = [ [[package]] name = "seccompiler" -version = "1.14.2" +version = "1.14.3" dependencies = [ "bincode", "clap", @@ -1223,7 +1223,7 @@ checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "snapshot-editor" -version = "1.14.2" +version = "1.14.3" dependencies = [ "clap", "clap-num", diff --git a/src/cpu-template-helper/Cargo.toml b/src/cpu-template-helper/Cargo.toml index 13016632041..910b38a8d5b 
100644 --- a/src/cpu-template-helper/Cargo.toml +++ b/src/cpu-template-helper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cpu-template-helper" -version = "1.14.2" +version = "1.14.3" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 01d5bb63725..8b6a5e1087a 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "firecracker" -version = "1.14.2" +version = "1.14.3" authors = ["Amazon Firecracker team "] edition = "2024" build = "build.rs" diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index ca27fb5d362..c68f7fe04e0 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -5,7 +5,7 @@ info: The API is accessible through HTTP calls on specific URLs carrying JSON modeled data. The transport medium is a Unix Domain Socket. - version: 1.14.2 + version: 1.14.3 termsOfService: "" contact: email: "firecracker-maintainers@amazon.com" diff --git a/src/jailer/Cargo.toml b/src/jailer/Cargo.toml index 2c3b838d7bc..46195fb922a 100644 --- a/src/jailer/Cargo.toml +++ b/src/jailer/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "jailer" -version = "1.14.2" +version = "1.14.3" authors = ["Amazon Firecracker team "] edition = "2024" description = "Process for starting Firecracker in production scenarios; applies a cgroup/namespace isolation barrier and then drops privileges." 
diff --git a/src/rebase-snap/Cargo.toml b/src/rebase-snap/Cargo.toml index 4aaeaec24ff..69457726b8f 100644 --- a/src/rebase-snap/Cargo.toml +++ b/src/rebase-snap/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rebase-snap" -version = "1.14.2" +version = "1.14.3" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" diff --git a/src/seccompiler/Cargo.toml b/src/seccompiler/Cargo.toml index a8d5e65ad95..657c5e6bfab 100644 --- a/src/seccompiler/Cargo.toml +++ b/src/seccompiler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "seccompiler" -version = "1.14.2" +version = "1.14.3" authors = ["Amazon Firecracker team "] edition = "2024" description = "Program that compiles multi-threaded seccomp-bpf filters expressed as JSON into raw BPF programs, serializing them and outputting them to a file." diff --git a/src/snapshot-editor/Cargo.toml b/src/snapshot-editor/Cargo.toml index 70045df8886..d358db4e39e 100644 --- a/src/snapshot-editor/Cargo.toml +++ b/src/snapshot-editor/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snapshot-editor" -version = "1.14.2" +version = "1.14.3" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" From 8a00171beb5446094c773e1bea65f536d8ca2895 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 13 Mar 2026 13:53:02 +0000 Subject: [PATCH 19/53] fix(entropy): cap per-request entropy allocation to 64 KiB Overlapping descriptors within a single chain can cause buffer.len() to exceed the distinct guest memory backing the request. Without a bound, handle_one() allocates a vec proportional to the inflated length, which can reach ~4 GiB from a 17 MiB guest. Introduce MAX_ENTROPY_BYTES (64 KiB) and clamp the allocation in handle_one() to that limit. Legitimate requests are unaffected since a 256-entry descriptor chain with typical page-sized buffers fits well within the cap. Add tests covering the capped path, the large inflated buffer path, and the pass-through for small requests. 
Signed-off-by: Nikita Kalyazin --- CHANGELOG.md | 10 ++ src/vmm/src/devices/virtio/rng/device.rs | 140 ++++++++++++++++++++++- 2 files changed, 147 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcc1740f525..7bd13118b81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.14.4] + +### Fixed + +- [#5762](https://github.com/firecracker-microvm/firecracker/pull/5762): Cap + virtio-rng per-request entropy to 64 KiB. Previously, a guest could construct + a descriptor chain that caused Firecracker to allocate more host memory than + the guest actually provided, potentially leading to excessive host memory + consumption. + ## [1.14.3] ### Fixed diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 2f9efd80909..f32171db06a 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -28,6 +28,14 @@ use crate::vstate::memory::GuestMemoryMmap; pub const ENTROPY_DEV_ID: &str = "rng"; +/// Maximum number of bytes `handle_one()` will serve per request. +/// +/// Overlapping descriptors within a single chain can cause `buffer.len()` to +/// exceed the amount of distinct guest memory actually backing the request. +/// Capping the per-request allocation to 64 KiB keeps host memory usage +/// bounded regardless of how the descriptor chain is constructed. 
+const MAX_ENTROPY_BYTES: u32 = 64 * 1024; + #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum EntropyError { /// Error while handling an Event file descriptor: {0} @@ -119,14 +127,19 @@ impl Entropy { return Ok(0); } - let mut rand_bytes = vec![0; self.buffer.len() as usize]; + // Cap the number of bytes we actually generate so that the host-side + // allocation stays bounded even when buffer.len() is inflated by + // overlapping descriptors in the chain. + let len = std::cmp::min(self.buffer.len(), MAX_ENTROPY_BYTES); + + let mut rand_bytes = vec![0; len as usize]; rand::fill(&mut rand_bytes).inspect_err(|_| { METRICS.host_rng_fails.inc(); })?; - // It is ok to unwrap here. We are writing `iovec.len()` bytes at offset 0. + // It is ok to unwrap here. We are writing `len` bytes at offset 0. self.buffer.write_all_volatile_at(&rand_bytes, 0).unwrap(); - Ok(self.buffer.len()) + Ok(len) } fn process_entropy_queue(&mut self) -> Result<(), InvalidAvailIdx> { @@ -611,4 +624,125 @@ mod tests { // The rate limiter event should have processed the pending buffer as well assert_eq!(METRICS.entropy_bytes.count(), entropy_bytes + 128); } + + /// Verify that handle_one() caps the host allocation to MAX_ENTROPY_BYTES + /// when overlapping descriptors inflate buffer.len() beyond the limit. + #[test] + fn test_handle_one_caps_overlapping_descriptors() { + use crate::devices::virtio::queue::VIRTQ_DESC_F_NEXT; + use crate::devices::virtio::test_utils::VirtQueue; + use crate::test_utils::single_region_mem; + use crate::vstate::memory::GuestAddress; + + // 32 descriptors × 4 KiB = 128 KiB claimed, which exceeds MAX_ENTROPY_BYTES (64 KiB). 
+ const N_DESC: u16 = 32; + const CHUNK: u32 = 4096; + + let mem = single_region_mem(0x20000); + let vq = VirtQueue::new(GuestAddress(0), &mem, 256); + let mut queue = vq.create_queue(); + + let target: u64 = 0x10000; + for i in 0..N_DESC { + let flags = VIRTQ_DESC_F_WRITE | if i < N_DESC - 1 { VIRTQ_DESC_F_NEXT } else { 0 }; + vq.dtable[i as usize].set(target, CHUNK, flags, i + 1); + } + vq.avail.ring[0].set(0); + vq.avail.idx.set(1); + + let head = queue.pop().unwrap().unwrap(); + // SAFETY: `mem` is a valid guest memory region and `head` is a descriptor chain + // obtained from the virtqueue backed by that memory. + let buf = unsafe { IoVecBufferMut::<256>::from_descriptor_chain(&mem, head).unwrap() }; + // buffer.len() is inflated well past the cap. + assert_eq!(buf.len(), u32::from(N_DESC) * CHUNK); // 128 KiB + + let mut dev = default_entropy(); + dev.buffer = buf; + let bytes = dev.handle_one().unwrap(); + + assert_eq!( + bytes, + MAX_ENTROPY_BYTES, + "handle_one() must cap at MAX_ENTROPY_BYTES ({MAX_ENTROPY_BYTES}), got {bytes} for \ + inflated buffer.len() = {}", + u32::from(N_DESC) * CHUNK + ); + } + + /// Verify that handle_one() caps a large inflated buffer (~4 GiB from + /// 255 overlapping descriptors) to MAX_ENTROPY_BYTES. 
+ #[test] + fn test_handle_one_caps_large_inflated_buffer() { + use crate::devices::virtio::queue::VIRTQ_DESC_F_NEXT; + use crate::devices::virtio::test_utils::VirtQueue; + use crate::test_utils::single_region_mem; + use crate::vstate::memory::GuestAddress; + + const N_DESC: u16 = 255; + const CHUNK: u32 = 16 * 1024 * 1024; // 16 MiB + const TOTAL: u64 = (N_DESC as u64) * (CHUNK as u64); // ~4 GiB + + let mem = single_region_mem((CHUNK as usize) + 0x100000); + let vq = VirtQueue::new(GuestAddress(0), &mem, 256); + let mut queue = vq.create_queue(); + + let target: u64 = 0x80000; + for i in 0..N_DESC { + let flags = VIRTQ_DESC_F_WRITE | if i < N_DESC - 1 { VIRTQ_DESC_F_NEXT } else { 0 }; + vq.dtable[i as usize].set(target, CHUNK, flags, i + 1); + } + vq.avail.ring[0].set(0); + vq.avail.idx.set(1); + + let head = queue.pop().unwrap().unwrap(); + // SAFETY: `mem` is a valid guest memory region and `head` is a descriptor chain + // obtained from the virtqueue backed by that memory. + let buf = unsafe { IoVecBufferMut::<256>::from_descriptor_chain(&mem, head).unwrap() }; + assert_eq!(buf.len() as u64, TOTAL); + + let mut dev = default_entropy(); + dev.buffer = buf; + let bytes = dev.handle_one().unwrap(); + + assert_eq!( + bytes, MAX_ENTROPY_BYTES, + "handle_one() must cap at MAX_ENTROPY_BYTES, not allocate {} bytes", + TOTAL + ); + } + + /// Verify that a request within MAX_ENTROPY_BYTES is served in full + /// (the cap does not truncate legitimate small requests). 
+ #[test] + fn test_handle_one_serves_small_request_in_full() { + use crate::devices::virtio::test_utils::VirtQueue; + use crate::test_utils::single_region_mem; + use crate::vstate::memory::GuestAddress; + + const SIZE: u32 = 256; + + let mem = single_region_mem(0x20000); + let vq = VirtQueue::new(GuestAddress(0), &mem, 256); + let mut queue = vq.create_queue(); + + vq.dtable[0].set(0x10000, SIZE, VIRTQ_DESC_F_WRITE, 0); + vq.avail.ring[0].set(0); + vq.avail.idx.set(1); + + let head = queue.pop().unwrap().unwrap(); + // SAFETY: `mem` is a valid guest memory region and `head` is a descriptor chain + // obtained from the virtqueue backed by that memory. + let buf = unsafe { IoVecBufferMut::<256>::from_descriptor_chain(&mem, head).unwrap() }; + assert_eq!(buf.len(), SIZE); + + let mut dev = default_entropy(); + dev.buffer = buf; + let bytes = dev.handle_one().unwrap(); + + assert_eq!( + bytes, SIZE, + "small request ({SIZE} bytes) should be served in full, got {bytes}" + ); + } } From 2e1a4c016e20f68a50afe2609811e1ffbf4645c0 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Mon, 23 Mar 2026 06:17:51 +0000 Subject: [PATCH 20/53] chore: Update aws-lc-rs 1.16.2 A vulnerability was found in aws-lc-sys [1]. Although Firecracker isn't affected as it doesn't use AWS-LC to validate CN, update aws-lc-rs (and aws-lc-sys indirectly) to suppress cargo-audit failure. 
[1]: https://rustsec.org/advisories/RUSTSEC-2026-0044.html Signed-off-by: Takahiro Itazuri --- Cargo.lock | 8 ++++---- deny.toml | 3 +-- src/vmm/Cargo.toml | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 60b966513b6..1c9b4fef3c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -91,9 +91,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-lc-rs" -version = "1.16.1" +version = "1.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94bffc006df10ac2a68c83692d734a465f8ee6c5b384d8545a636f81d858f4bf" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" dependencies = [ "aws-lc-sys", "untrusted", @@ -102,9 +102,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.38.0" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4321e568ed89bb5a7d291a7f37997c2c0df89809d7b6d12062c81ddb54aa782e" +checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" dependencies = [ "cc", "cmake", diff --git a/deny.toml b/deny.toml index be3a1040463..99d17573d47 100644 --- a/deny.toml +++ b/deny.toml @@ -5,8 +5,7 @@ allow = [ "Apache-2.0", "BSD-3-Clause", "ISC", - "Unicode-3.0", - "OpenSSL" + "Unicode-3.0" ] [[bans.deny]] diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 84a7083323f..a09109251ac 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -17,7 +17,7 @@ gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] acpi_tables = { path = "../acpi-tables" } arrayvec = { version = "0.7.6", optional = true } -aws-lc-rs = "1.16.1" +aws-lc-rs = "1.16.2" base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.10.0" From 253140c90e984038dae06c0be9085f6504d04df9 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 2 Apr 2026 13:03:07 +0100 Subject: [PATCH 21/53] fix(tests/spectre-meltdown-checker): pin due to bugs in tip Pin 
back spectre-meltdown-checker.sh to previous working version due to issues introduced in speed47/spectre-meltdown-checker#527 while we're investigating them. Signed-off-by: Riccardo Mancini --- tests/integration_tests/security/test_vulnerabilities.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py index b787196f6f5..a951332e1df 100644 --- a/tests/integration_tests/security/test_vulnerabilities.py +++ b/tests/integration_tests/security/test_vulnerabilities.py @@ -17,7 +17,8 @@ from framework.microvm import MicroVMFactory from framework.properties import global_props -CHECKER_URL = "https://raw.githubusercontent.com/speed47/spectre-meltdown-checker/master/spectre-meltdown-checker.sh" +# Pinned due to issues introduced in https://github.com/speed47/spectre-meltdown-checker/pull/527 +CHECKER_URL = "https://raw.githubusercontent.com/speed47/spectre-meltdown-checker/3a822fdcf291ebb8bfbcb77aa216ac342c6b2f12/spectre-meltdown-checker.sh" CHECKER_FILENAME = "spectre-meltdown-checker.sh" REMOTE_CHECKER_PATH = f"/tmp/{CHECKER_FILENAME}" REMOTE_CHECKER_COMMAND = f"sh {REMOTE_CHECKER_PATH} --no-intel-db --batch json" From f47fb0fb179ae4d207c03c7a5637015584f8de35 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Wed, 18 Mar 2026 12:45:25 +0000 Subject: [PATCH 22/53] refactor(pci): Move device status constants to pci/mod.rs Move the virtio device status constants from pci/device.rs to a shared pci/mod.rs so they can be referenced by both device.rs and common_config.rs (the latter will use them in a subsequent commit). The MMIO transport defines its own device_status constants as u32 (matching the 32-bit status register [1]), while the PCI transport uses u8 (matching the 8-bit device_status field [2]). Both are spec-correct. Unifying those two in a single module is left as a future refactoring to keep this change simpler. 
No functional changes intended. [1]: https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1820002 [2]: https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1420003 Signed-off-by: Takahiro Itazuri --- .../devices/virtio/transport/pci/device.rs | 41 +++++++------------ .../src/devices/virtio/transport/pci/mod.rs | 14 +++++++ 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index f0cc8bdefc7..620eaa755ab 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -34,6 +34,7 @@ use crate::devices::virtio::queue::Queue; use crate::devices::virtio::transport::pci::common_config::{ VirtioPciCommonConfig, VirtioPciCommonConfigState, }; +use crate::devices::virtio::transport::pci::device_status::*; use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::logger::{debug, error}; use crate::pci::configuration::{PciCapability, PciConfiguration, PciConfigurationState}; @@ -46,13 +47,6 @@ use crate::vstate::interrupts::{InterruptError, MsixVectorGroup}; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::resources::ResourceAllocator; -const DEVICE_INIT: u8 = 0x00; -const DEVICE_ACKNOWLEDGE: u8 = 0x01; -const DEVICE_DRIVER: u8 = 0x02; -const DEVICE_DRIVER_OK: u8 = 0x04; -const DEVICE_FEATURES_OK: u8 = 0x08; -const DEVICE_FAILED: u8 = 0x80; - /// Vector value used to disable MSI for a queue. 
pub const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; @@ -468,15 +462,14 @@ impl VirtioPciDevice { } fn is_driver_ready(&self) -> bool { - let ready_bits = - (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); + let ready_bits = (ACKNOWLEDGE | DRIVER | DRIVER_OK | FEATURES_OK); self.common_config.driver_status == ready_bits - && self.common_config.driver_status & DEVICE_FAILED == 0 + && self.common_config.driver_status & FAILED == 0 } /// Determines if the driver has requested the device (re)init / reset itself fn is_driver_init(&self) -> bool { - self.common_config.driver_status == DEVICE_INIT + self.common_config.driver_status == INIT } pub fn config_bar_addr(&self) -> u64 { @@ -929,7 +922,7 @@ impl PciDevice for VirtioPciDevice { error!("Attempt to reset device when not implemented in underlying device"); // TODO: currently we don't support device resetting, but we still // follow the spec and set the status field to 0. - self.common_config.driver_status = DEVICE_INIT; + self.common_config.driver_status = INIT; } } } @@ -960,7 +953,6 @@ mod tests { use crate::arch::MEM_64BIT_DEVICES_START; use crate::builder::tests::default_vmm; use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::device_status::{ACKNOWLEDGE, DRIVER, DRIVER_OK, FEATURES_OK}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ids; use crate::devices::virtio::rng::Entropy; @@ -970,6 +962,9 @@ mod tests { NOTIFY_OFF_MULTIPLIER, PciVirtioSubclass, VirtioPciCap, VirtioPciCfgCap, VirtioPciNotifyCap, }; + use crate::devices::virtio::transport::pci::device_status::{ + ACKNOWLEDGE, DRIVER, DRIVER_OK, FEATURES_OK, + }; use crate::pci::PciDevice; use crate::pci::msix::MsixCap; use crate::rate_limiter::RateLimiter; @@ -1557,14 +1552,8 @@ mod tests { .load(std::sync::atomic::Ordering::SeqCst) ); - write_driver_status( - &mut locked_virtio_pci_device, - ACKNOWLEDGE.try_into().unwrap(), - ); - 
write_driver_status( - &mut locked_virtio_pci_device, - (ACKNOWLEDGE | DRIVER).try_into().unwrap(), - ); + write_driver_status(&mut locked_virtio_pci_device, ACKNOWLEDGE); + write_driver_status(&mut locked_virtio_pci_device, ACKNOWLEDGE | DRIVER); assert!(!locked_virtio_pci_device.is_driver_init()); assert!(!locked_virtio_pci_device.is_driver_ready()); assert!( @@ -1574,7 +1563,7 @@ mod tests { ); let status = read_driver_status(&mut locked_virtio_pci_device); - assert_eq!(status as u32, ACKNOWLEDGE | DRIVER); + assert_eq!(status, ACKNOWLEDGE | DRIVER); // Entropy device just offers VIRTIO_F_VERSION_1 let offered_features = read_device_features(&mut locked_virtio_pci_device); @@ -1583,10 +1572,10 @@ mod tests { write_driver_features(&mut locked_virtio_pci_device, offered_features); write_driver_status( &mut locked_virtio_pci_device, - (ACKNOWLEDGE | DRIVER | FEATURES_OK).try_into().unwrap(), + ACKNOWLEDGE | DRIVER | FEATURES_OK, ); let status = read_driver_status(&mut locked_virtio_pci_device); - assert!((status & u8::try_from(FEATURES_OK).unwrap()) != 0); + assert!((status & FEATURES_OK) != 0); assert!(!locked_virtio_pci_device.is_driver_init()); assert!(!locked_virtio_pci_device.is_driver_ready()); @@ -1600,9 +1589,7 @@ mod tests { write_driver_status( &mut locked_virtio_pci_device, - (ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK) - .try_into() - .unwrap(), + ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK, ); assert!(!locked_virtio_pci_device.is_driver_init()); diff --git a/src/vmm/src/devices/virtio/transport/pci/mod.rs b/src/vmm/src/devices/virtio/transport/pci/mod.rs index 520b52274b3..c286e301d9e 100644 --- a/src/vmm/src/devices/virtio/transport/pci/mod.rs +++ b/src/vmm/src/devices/virtio/transport/pci/mod.rs @@ -3,3 +3,17 @@ pub mod common_config; pub mod device; + +/// Virtio device status field values +/// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-110001 +/// +/// These are u8 because the PCI transport's device_status 
register is 8 bits wide. +/// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1420003 +pub(crate) mod device_status { + pub const INIT: u8 = 0x00; + pub const ACKNOWLEDGE: u8 = 0x01; + pub const DRIVER: u8 = 0x02; + pub const DRIVER_OK: u8 = 0x04; + pub const FEATURES_OK: u8 = 0x08; + pub const FAILED: u8 = 0x80; +} From baa7c268ffaa45d15caee20829ed2488bd4bfdcc Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Thu, 19 Mar 2026 14:57:51 +0000 Subject: [PATCH 23/53] chore(pci): Remove redundant FAILED check in is_driver_ready() The equality check against the exact ready_bits value already guarantees FAILED is not set. No functional changes intended. Signed-off-by: Takahiro Itazuri --- src/vmm/src/devices/virtio/transport/pci/device.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 620eaa755ab..aee61e0b46a 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -464,7 +464,6 @@ impl VirtioPciDevice { fn is_driver_ready(&self) -> bool { let ready_bits = (ACKNOWLEDGE | DRIVER | DRIVER_OK | FEATURES_OK); self.common_config.driver_status == ready_bits - && self.common_config.driver_status & FAILED == 0 } /// Determines if the driver has requested the device (re)init / reset itself From 931e6a98a16040288eddcc77a89014cb88748aa6 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Thu, 19 Mar 2026 16:05:57 +0000 Subject: [PATCH 24/53] refactor(pci): Replace raw hex offsets with named constants Add common_config_offset module with named constants for all virtio PCI common configuration offsets. Replace all raw hex literals with these constants to improve readability and reduce the risk of offset typos. No functional changes intended. 
Signed-off-by: Takahiro Itazuri --- .../virtio/transport/pci/common_config.rs | 242 ++++++++---------- .../devices/virtio/transport/pci/device.rs | 61 +++-- .../src/devices/virtio/transport/pci/mod.rs | 48 ++++ 3 files changed, 202 insertions(+), 149 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index 70876d7aefc..9558ed41814 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -16,6 +16,7 @@ use vm_memory::GuestAddress; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::pci::common_config_offset::*; use crate::devices::virtio::transport::pci::device::VIRTQ_MSI_NO_VECTOR; use crate::logger::warn; @@ -34,28 +35,6 @@ pub struct VirtioPciCommonConfigState { /// Contains the data for reading and writing the common configuration structure of a virtio PCI /// device. -/// -/// * Registers: -/// -/// ** About the whole device. -/// le32 device_feature_select; // 0x00 // read-write -/// le32 device_feature; // 0x04 // read-only for driver -/// le32 driver_feature_select; // 0x08 // read-write -/// le32 driver_feature; // 0x0C // read-write -/// le16 msix_config; // 0x10 // read-write -/// le16 num_queues; // 0x12 // read-only for driver -/// u8 device_status; // 0x14 // read-write (driver_status) -/// u8 config_generation; // 0x15 // read-only for driver -/// -/// ** About a specific virtqueue. -/// le16 queue_select; // 0x16 // read-write -/// le16 queue_size; // 0x18 // read-write, power of 2, or 0. 
-/// le16 queue_msix_vector; // 0x1A // read-write -/// le16 queue_enable; // 0x1C // read-write (Ready) -/// le16 queue_notify_off; // 0x1E // read-only for driver -/// le64 queue_desc; // 0x20 // read-write -/// le64 queue_avail; // 0x28 // read-write -/// le64 queue_used; // 0x30 // read-write #[derive(Debug)] pub struct VirtioPciCommonConfig { pub driver_status: u8, @@ -136,8 +115,8 @@ impl VirtioPciCommonConfig { fn read_common_config_byte(&self, offset: u64) -> u8 { // The driver is only allowed to do aligned, properly sized access. match offset { - 0x14 => self.driver_status, - 0x15 => self.config_generation, + DEVICE_STATUS => self.driver_status, + CONFIG_GENERATION => self.config_generation, _ => { warn!("pci: invalid virtio config byte read: 0x{:x}", offset); 0 @@ -147,7 +126,7 @@ impl VirtioPciCommonConfig { fn write_common_config_byte(&mut self, offset: u64, value: u8) { match offset { - 0x14 => self.driver_status = value, + DEVICE_STATUS => self.driver_status = value, _ => { warn!("pci: invalid virtio config byte write: 0x{:x}", offset); } @@ -156,25 +135,25 @@ impl VirtioPciCommonConfig { fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { match offset { - 0x10 => self.msix_config.load(Ordering::Acquire), - 0x12 => queues.len().try_into().unwrap(), // num_queues - 0x16 => self.queue_select, - 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), + MSIX_CONFIG => self.msix_config.load(Ordering::Acquire), + NUM_QUEUES => queues.len().try_into().unwrap(), + QUEUE_SELECT => self.queue_select, + QUEUE_SIZE => self.with_queue(queues, |q| q.size).unwrap_or(0), // If `queue_select` points to an invalid queue we should return NO_VECTOR. // Reading from here // https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1280005: // // > The device MUST return vector mapped to a given event, (NO_VECTOR if unmapped) on // > read of config_msix_vector/queue_msix_vector. 
- 0x1a => self + QUEUE_MSIX_VECTOR => self .msix_queues .lock() .unwrap() .get(self.queue_select as usize) .copied() .unwrap_or(VIRTQ_MSI_NO_VECTOR), - 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), - 0x1e => self.queue_select, // notify_off + QUEUE_ENABLE => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), + QUEUE_NOTIFY_OFF => self.queue_select, _ => { warn!("pci: invalid virtio register word read: 0x{:x}", offset); 0 @@ -184,7 +163,7 @@ impl VirtioPciCommonConfig { fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { match offset { - 0x10 => { + MSIX_CONFIG => { // Make sure that the guest doesn't select an invalid vector. We are offering // `num_queues + 1` vectors (plus one for configuration updates). If an invalid // vector has been selected, we just store the `NO_VECTOR` value. @@ -198,9 +177,9 @@ impl VirtioPciCommonConfig { .store(VIRTQ_MSI_NO_VECTOR, Ordering::Release); } } - 0x16 => self.queue_select = value, - 0x18 => self.with_queue_mut(queues, |q| q.size = value), - 0x1a => { + QUEUE_SELECT => self.queue_select = value, + QUEUE_SIZE => self.with_queue_mut(queues, |q| q.size = value), + QUEUE_MSIX_VECTOR => { let mut msix_queues = self.msix_queues.lock().expect("Poisoned lock"); let nr_vectors = msix_queues.len() + 1; // Make sure that `queue_select` points to a valid queue. 
If not, we won't do @@ -216,7 +195,7 @@ impl VirtioPciCommonConfig { } } } - 0x1c => self.with_queue_mut(queues, |q| { + QUEUE_ENABLE => self.with_queue_mut(queues, |q| { if value != 0 { q.ready = value == 1; } @@ -229,8 +208,8 @@ impl VirtioPciCommonConfig { fn read_common_config_dword(&self, offset: u64, device: Arc>) -> u32 { match offset { - 0x00 => self.device_feature_select, - 0x04 => { + DEVICE_FEATURE_SELECT => self.device_feature_select, + DEVICE_FEATURE => { let locked_device = device.lock().unwrap(); // Only 64 bits of features (2 pages) are defined for now, so limit // device_feature_select to avoid shifting by 64 or more bits. @@ -241,43 +220,43 @@ impl VirtioPciCommonConfig { 0 } } - 0x08 => self.driver_feature_select, - 0x20 => { + DRIVER_FEATURE_SELECT => self.driver_feature_select, + QUEUE_DESC_LO => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.desc_table_address.0 & 0xffff_ffff) as u32 }) .unwrap_or_default() } - 0x24 => { + QUEUE_DESC_HI => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.desc_table_address.0 >> 32) as u32 }) .unwrap_or_default() } - 0x28 => { + QUEUE_AVAIL_LO => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.avail_ring_address.0 & 0xffff_ffff) as u32 }) .unwrap_or_default() } - 0x2c => { + QUEUE_AVAIL_HI => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.avail_ring_address.0 >> 32) as u32 }) .unwrap_or_default() } - 0x30 => { + QUEUE_USED_LO => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.used_ring_address.0 & 0xffff_ffff) as u32 }) .unwrap_or_default() } - 0x34 => { + QUEUE_USED_HI => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.used_ring_address.0 >> 32) as u32 @@ -308,25 +287,25 @@ impl VirtioPciCommonConfig { let mut locked_device = 
device.lock().unwrap(); match offset { - 0x00 => self.device_feature_select = value, - 0x08 => self.driver_feature_select = value, - 0x0c => locked_device.ack_features_by_page(self.driver_feature_select, value), - 0x20 => self.with_queue_mut(locked_device.queues_mut(), |q| { + DEVICE_FEATURE_SELECT => self.device_feature_select = value, + DRIVER_FEATURE_SELECT => self.driver_feature_select = value, + DRIVER_FEATURE => locked_device.ack_features_by_page(self.driver_feature_select, value), + QUEUE_DESC_LO => self.with_queue_mut(locked_device.queues_mut(), |q| { lo(&mut q.desc_table_address, value) }), - 0x24 => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_DESC_HI => self.with_queue_mut(locked_device.queues_mut(), |q| { hi(&mut q.desc_table_address, value) }), - 0x28 => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_AVAIL_LO => self.with_queue_mut(locked_device.queues_mut(), |q| { lo(&mut q.avail_ring_address, value) }), - 0x2c => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_AVAIL_HI => self.with_queue_mut(locked_device.queues_mut(), |q| { hi(&mut q.avail_ring_address, value) }), - 0x30 => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_USED_LO => self.with_queue_mut(locked_device.queues_mut(), |q| { lo(&mut q.used_ring_address, value) }), - 0x34 => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_USED_HI => self.with_queue_mut(locked_device.queues_mut(), |q| { hi(&mut q.used_ring_address, value) }), _ => { @@ -355,6 +334,7 @@ mod tests { use super::*; use crate::devices::virtio::transport::mmio::tests::DummyDevice; + use crate::devices::virtio::transport::pci::common_config_offset::*; fn default_device() -> Arc> { Arc::new(Mutex::new(DummyDevice::new())) @@ -386,53 +366,53 @@ mod tests { let dev = Arc::new(Mutex::new(DummyDevice::new())); // Can set all bits of driver_status. 
- regs.write(0x14, &[0x55], dev.clone()); + regs.write(DEVICE_STATUS, &[0x55], dev.clone()); let mut read_back = vec![0x00]; - regs.read(0x14, &mut read_back, dev.clone()); + regs.read(DEVICE_STATUS, &mut read_back, dev.clone()); assert_eq!(read_back[0], 0x55); // The config generation register is read only. - regs.write(0x15, &[0xaa], dev.clone()); + regs.write(CONFIG_GENERATION, &[0xaa], dev.clone()); let mut read_back = vec![0x00]; - regs.read(0x15, &mut read_back, dev.clone()); + regs.read(CONFIG_GENERATION, &mut read_back, dev.clone()); assert_eq!(read_back[0], 0x55); // Device features is read-only and passed through from the device. - regs.write(0x04, &[0, 0, 0, 0], dev.clone()); + regs.write(DEVICE_FEATURE, &[0, 0, 0, 0], dev.clone()); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x04, &mut read_back, dev.clone()); + regs.read(DEVICE_FEATURE, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0u32); // Feature select registers are read/write. - regs.write(0x00, &[1, 2, 3, 4], dev.clone()); + regs.write(DEVICE_FEATURE_SELECT, &[1, 2, 3, 4], dev.clone()); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x00, &mut read_back, dev.clone()); + regs.read(DEVICE_FEATURE_SELECT, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); - regs.write(0x08, &[1, 2, 3, 4], dev.clone()); + regs.write(DRIVER_FEATURE_SELECT, &[1, 2, 3, 4], dev.clone()); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x08, &mut read_back, dev.clone()); + regs.read(DRIVER_FEATURE_SELECT, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); // 'queue_select' can be read and written. 
- regs.write(0x16, &[0xaa, 0x55], dev.clone()); + regs.write(QUEUE_SELECT, &[0xaa, 0x55], dev.clone()); let mut read_back = vec![0x00, 0x00]; - regs.read(0x16, &mut read_back, dev.clone()); + regs.read(QUEUE_SELECT, &mut read_back, dev.clone()); assert_eq!(read_back[0], 0xaa); assert_eq!(read_back[1], 0x55); // Getting the MSI vector when `queue_select` points to an invalid queue should return // NO_VECTOR (0xffff) - regs.read(0x1a, &mut read_back, dev.clone()); + regs.read(QUEUE_MSIX_VECTOR, &mut read_back, dev.clone()); assert_eq!(read_back, [0xff, 0xff]); // Writing the MSI vector of an invalid `queue_select` does not have any effect. - regs.write(0x1a, &[0x12, 0x13], dev.clone()); + regs.write(QUEUE_MSIX_VECTOR, &[0x12, 0x13], dev.clone()); assert_eq!(read_back, [0xff, 0xff]); // Valid `queue_select` though should setup the corresponding MSI-X queue. - regs.write(0x16, &[0x1, 0x0], dev.clone()); + regs.write(QUEUE_SELECT, &[0x1, 0x0], dev.clone()); assert_eq!(regs.queue_select, 1); - regs.write(0x1a, &[0x1, 0x0], dev.clone()); - regs.read(0x1a, &mut read_back, dev); + regs.write(QUEUE_MSIX_VECTOR, &[0x1, 0x0], dev.clone()); + regs.read(QUEUE_MSIX_VECTOR, &mut read_back, dev); assert_eq!(LittleEndian::read_u16(&read_back[..2]), 0x1); } @@ -447,15 +427,15 @@ mod tests { .unwrap() .set_avail_features(0x0000_1312_0000_1110); - config.read(0x04, features.as_mut_slice(), device.clone()); + config.read(DEVICE_FEATURE, features.as_mut_slice(), device.clone()); assert_eq!(features, 0x1110); // select second page - config.write(0x0, 1u32.as_slice(), device.clone()); - config.read(0x04, features.as_mut_slice(), device.clone()); + config.write(DEVICE_FEATURE_SELECT, 1u32.as_slice(), device.clone()); + config.read(DEVICE_FEATURE, features.as_mut_slice(), device.clone()); assert_eq!(features, 0x1312); // Try a third page. 
It doesn't exist so we should get all 0s - config.write(0x0, 2u32.as_slice(), device.clone()); - config.read(0x04, features.as_mut_slice(), device.clone()); + config.write(DEVICE_FEATURE_SELECT, 2u32.as_slice(), device.clone()); + config.read(DEVICE_FEATURE, features.as_mut_slice(), device.clone()); assert_eq!(features, 0x0); } @@ -469,11 +449,11 @@ mod tests { .set_avail_features(0x0000_1312_0000_1110); // ACK some features of the first page - config.write(0x0c, 0x1100u32.as_slice(), device.clone()); + config.write(DRIVER_FEATURE, 0x1100u32.as_slice(), device.clone()); assert_eq!(device.lock().unwrap().acked_features(), 0x1100); // ACK some features of the second page - config.write(0x08, 1u32.as_slice(), device.clone()); - config.write(0x0c, 0x0000_1310u32.as_slice(), device.clone()); + config.write(DRIVER_FEATURE_SELECT, 1u32.as_slice(), device.clone()); + config.write(DRIVER_FEATURE, 0x0000_1310u32.as_slice(), device.clone()); assert_eq!( device.lock().unwrap().acked_features(), 0x0000_1310_0000_1100 @@ -486,11 +466,11 @@ mod tests { let mut device = default_device(); let mut num_queues = 0u16; - config.read(0x12, num_queues.as_mut_slice(), device.clone()); + config.read(NUM_QUEUES, num_queues.as_mut_slice(), device.clone()); assert_eq!(num_queues, 2); // `num_queues` is read-only - config.write(0x12, 4u16.as_slice(), device.clone()); - config.read(0x12, num_queues.as_mut_slice(), device.clone()); + config.write(NUM_QUEUES, 4u16.as_slice(), device.clone()); + config.read(NUM_QUEUES, num_queues.as_mut_slice(), device.clone()); assert_eq!(num_queues, 2); } @@ -500,10 +480,10 @@ mod tests { let mut device = default_device(); let mut status = 0u8; - config.read(0x14, status.as_mut_slice(), device.clone()); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); assert_eq!(status, 0); - config.write(0x14, 0x42u8.as_slice(), device.clone()); - config.read(0x14, status.as_mut_slice(), device.clone()); + config.write(DEVICE_STATUS, 0x42u8.as_slice(), 
device.clone()); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); assert_eq!(status, 0x42); } @@ -516,14 +496,14 @@ mod tests { // Our device has 2 queues, so we should be using 3 vectors in total. // Trying to set a vector bigger than that should fail. Observing the // failure happens through a subsequent read that should return NO_VECTOR. - config.write(0x10, 3u16.as_slice(), device.clone()); - config.read(0x10, vector.as_mut_slice(), device.clone()); + config.write(MSIX_CONFIG, 3u16.as_slice(), device.clone()); + config.read(MSIX_CONFIG, vector.as_mut_slice(), device.clone()); assert_eq!(vector, VIRTQ_MSI_NO_VECTOR); // Any of the 3 valid values should work for i in 0u16..3 { - config.write(0x10, i.as_slice(), device.clone()); - config.read(0x10, vector.as_mut_slice(), device.clone()); + config.write(MSIX_CONFIG, i.as_slice(), device.clone()); + config.read(MSIX_CONFIG, vector.as_mut_slice(), device.clone()); assert_eq!(vector, i); } } @@ -536,8 +516,8 @@ mod tests { let mut max_size = [0u16; 2]; for queue_id in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); - config.read(0x18, len.as_mut_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!( len, device.lock().unwrap().queues()[queue_id as usize].max_size @@ -545,19 +525,19 @@ mod tests { max_size[queue_id as usize] = len; } - config.write(0x16, 2u16.as_slice(), device.clone()); - config.read(0x18, len.as_mut_slice(), device.clone()); + config.write(QUEUE_SELECT, 2u16.as_slice(), device.clone()); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, 0); // Setup size smaller than what is the maximum offered for queue_id in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); config.write( - 0x18, + QUEUE_SIZE, (max_size[queue_id as usize] - 
1).as_slice(), device.clone(), ); - config.read(0x18, len.as_mut_slice(), device.clone()); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, max_size[queue_id as usize] - 1); } } @@ -573,16 +553,16 @@ mod tests { // failure happens through a subsequent read that should return NO_VECTOR. for queue_id in 0u16..2 { // Select queue - config.write(0x16, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); - config.write(0x1a, 3u16.as_slice(), device.clone()); - config.read(0x1a, vector.as_mut_slice(), device.clone()); + config.write(QUEUE_MSIX_VECTOR, 3u16.as_slice(), device.clone()); + config.read(QUEUE_MSIX_VECTOR, vector.as_mut_slice(), device.clone()); assert_eq!(vector, VIRTQ_MSI_NO_VECTOR); // Any of the 3 valid values should work for vector_id in 0u16..3 { - config.write(0x1a, vector_id.as_slice(), device.clone()); - config.read(0x1a, vector.as_mut_slice(), device.clone()); + config.write(QUEUE_MSIX_VECTOR, vector_id.as_slice(), device.clone()); + config.read(QUEUE_MSIX_VECTOR, vector.as_mut_slice(), device.clone()); assert_eq!(vector, vector_id); } } @@ -595,20 +575,20 @@ mod tests { let mut enabled = 0u16; for queue_id in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); // Initially queue should be disabled - config.read(0x1c, enabled.as_mut_slice(), device.clone()); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 0); // Enable queue - config.write(0x1c, 1u16.as_slice(), device.clone()); - config.read(0x1c, enabled.as_mut_slice(), device.clone()); + config.write(QUEUE_ENABLE, 1u16.as_slice(), device.clone()); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 1); // According to the specification "The driver MUST NOT write a 0 to queue_enable." 
- config.write(0x1c, 0u16.as_slice(), device.clone()); - config.read(0x1c, enabled.as_mut_slice(), device.clone()); + config.write(QUEUE_ENABLE, 0u16.as_slice(), device.clone()); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 1); } } @@ -624,13 +604,13 @@ mod tests { // a field setup by the device and should be read-only for the driver for queue_id in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); - config.read(0x1e, offset.as_mut_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.read(QUEUE_NOTIFY_OFF, offset.as_mut_slice(), device.clone()); assert_eq!(offset, queue_id); // Writing to it should not have any effect - config.write(0x1e, 0x42.as_slice(), device.clone()); - config.read(0x1e, offset.as_mut_slice(), device.clone()); + config.write(QUEUE_NOTIFY_OFF, 0x42.as_slice(), device.clone()); + config.read(QUEUE_NOTIFY_OFF, offset.as_mut_slice(), device.clone()); assert_eq!(offset, queue_id); } } @@ -669,9 +649,9 @@ mod tests { let mut reg64bit = 0; for queue_id in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); - for offset in [0x20, 0x28, 0x30] { + for offset in [QUEUE_DESC_LO, QUEUE_AVAIL_LO, QUEUE_USED_LO] { write_64bit_field(&mut config, device.clone(), offset, 0x0000_1312_0000_1110); assert_eq!( read_64bit_field(&mut config, device.clone(), offset), @@ -697,51 +677,51 @@ mod tests { device.lock().unwrap().queues_mut()[0].desc_table_address = GuestAddress(0x0000_1312_0000_1110); let mut buffer = [0u8; 8]; - config.read(0x20, &mut buffer[..1], device.clone()); + config.read(QUEUE_DESC_LO, &mut buffer[..1], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x20, &mut buffer[..2], device.clone()); + config.read(QUEUE_DESC_LO, &mut buffer[..2], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x20, &mut buffer[..8], device.clone()); + 
config.read(QUEUE_DESC_LO, &mut buffer[..8], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x20, &mut buffer[..4], device.clone()); + config.read(QUEUE_DESC_LO, &mut buffer[..4], device.clone()); assert_eq!(LittleEndian::read_u32(&buffer[..4]), 0x1110); - config.read(0x24, &mut buffer[..4], device.clone()); + config.read(QUEUE_DESC_HI, &mut buffer[..4], device.clone()); assert_eq!(LittleEndian::read_u32(&buffer[..4]), 0x1312); // 32-bit fields config.device_feature_select = 0x42; let mut buffer = [0u8; 8]; - config.read(0, &mut buffer[..1], device.clone()); + config.read(DEVICE_FEATURE_SELECT, &mut buffer[..1], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0, &mut buffer[..2], device.clone()); + config.read(DEVICE_FEATURE_SELECT, &mut buffer[..2], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0, &mut buffer[..8], device.clone()); + config.read(DEVICE_FEATURE_SELECT, &mut buffer[..8], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0, &mut buffer[..4], device.clone()); + config.read(DEVICE_FEATURE_SELECT, &mut buffer[..4], device.clone()); assert_eq!(LittleEndian::read_u32(&buffer[..4]), 0x42); // 16-bit fields let mut buffer = [0u8; 8]; config.queue_select = 0x42; - config.read(0x16, &mut buffer[..1], device.clone()); + config.read(QUEUE_SELECT, &mut buffer[..1], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x16, &mut buffer[..4], device.clone()); + config.read(QUEUE_SELECT, &mut buffer[..4], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x16, &mut buffer[..8], device.clone()); + config.read(QUEUE_SELECT, &mut buffer[..8], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x16, &mut buffer[..2], device.clone()); + config.read(QUEUE_SELECT, &mut buffer[..2], device.clone()); assert_eq!(LittleEndian::read_u16(&buffer[..2]), 0x42); // 8-bit fields let mut buffer = [0u8; 8]; config.driver_status = 0x42; - config.read(0x14, &mut buffer[..2], device.clone()); + 
config.read(DEVICE_STATUS, &mut buffer[..2], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x14, &mut buffer[..4], device.clone()); + config.read(DEVICE_STATUS, &mut buffer[..4], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x14, &mut buffer[..8], device.clone()); + config.read(DEVICE_STATUS, &mut buffer[..8], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x14, &mut buffer[..1], device.clone()); + config.read(DEVICE_STATUS, &mut buffer[..1], device.clone()); assert_eq!(buffer[0], 0x42); } } diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index aee61e0b46a..1109868eb80 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -955,6 +955,7 @@ mod tests { use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ids; use crate::devices::virtio::rng::Entropy; + use crate::devices::virtio::transport::pci::common_config_offset::*; use crate::devices::virtio::transport::pci::device::{ COMMON_CONFIG_BAR_OFFSET, COMMON_CONFIG_SIZE, DEVICE_CONFIG_BAR_OFFSET, DEVICE_CONFIG_SIZE, ISR_CONFIG_BAR_OFFSET, ISR_CONFIG_SIZE, NOTIFICATION_BAR_OFFSET, NOTIFICATION_SIZE, @@ -1365,15 +1366,15 @@ mod tests { let mut locked_virtio_pci_device = device.lock().unwrap(); // Let's read the number of queues of the entropy device - // That information is located at offset 0x12 past the BAR region belonging to the common - // config capability. - let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET).unwrap() + 0x12; + // That information is located at NUM_QUEUES offset past the BAR region belonging to the + // common config capability. 
+ let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET + NUM_QUEUES).unwrap(); let len = 2u32; let num_queues = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); assert_eq!(num_queues, 1); // Let's update the driver features and see if that takes effect - let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET).unwrap() + 0x14; + let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET + DEVICE_STATUS).unwrap(); let len = 1u32; let device_status = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); assert_eq!(device_status, 0); @@ -1475,28 +1476,44 @@ mod tests { } fn write_driver_status(device: &mut VirtioPciDevice, status: u8) { - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x14, status.as_slice()); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DEVICE_STATUS, + status.as_slice(), + ); } fn read_driver_status(device: &mut VirtioPciDevice) -> u8 { let mut status = 0u8; - device.read_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x14, status.as_mut_slice()); + device.read_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DEVICE_STATUS, + status.as_mut_slice(), + ); status } fn read_device_features(device: &mut VirtioPciDevice) -> u64 { let mut features_lo = 0u32; - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET, 0u32.as_slice()); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DEVICE_FEATURE_SELECT, + 0u32.as_slice(), + ); device.read_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x4, + COMMON_CONFIG_BAR_OFFSET + DEVICE_FEATURE, features_lo.as_mut_slice(), ); let mut features_hi = 0u32; - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET, 1u32.as_slice()); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DEVICE_FEATURE_SELECT, + 1u32.as_slice(), + ); device.read_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x4, + COMMON_CONFIG_BAR_OFFSET + DEVICE_FEATURE, features_hi.as_mut_slice(), ); @@ -1504,16 +1521,24 @@ mod tests { } fn write_driver_features(device: &mut VirtioPciDevice, features: u64) { - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x8, 
0u32.as_slice()); device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0xc, + COMMON_CONFIG_BAR_OFFSET + DRIVER_FEATURE_SELECT, + 0u32.as_slice(), + ); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DRIVER_FEATURE, ((features & 0xffff_ffff) as u32).as_slice(), ); - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x8, 1u32.as_slice()); device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0xc, + COMMON_CONFIG_BAR_OFFSET + DRIVER_FEATURE_SELECT, + 1u32.as_slice(), + ); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DRIVER_FEATURE, (((features >> 32) & 0xffff_ffff) as u32).as_slice(), ); } @@ -1521,20 +1546,20 @@ mod tests { fn setup_queues(device: &mut VirtioPciDevice) { device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x20, + COMMON_CONFIG_BAR_OFFSET + QUEUE_DESC_LO, 0x8000_0000u64.as_slice(), ); device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x28, + COMMON_CONFIG_BAR_OFFSET + QUEUE_AVAIL_LO, 0x8000_1000u64.as_slice(), ); device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x30, + COMMON_CONFIG_BAR_OFFSET + QUEUE_USED_LO, 0x8000_2000u64.as_slice(), ); - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x1c, 1u16.as_slice()); + device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + QUEUE_ENABLE, 1u16.as_slice()); } #[test] diff --git a/src/vmm/src/devices/virtio/transport/pci/mod.rs b/src/vmm/src/devices/virtio/transport/pci/mod.rs index c286e301d9e..a5f9f54bcd8 100644 --- a/src/vmm/src/devices/virtio/transport/pci/mod.rs +++ b/src/vmm/src/devices/virtio/transport/pci/mod.rs @@ -17,3 +17,51 @@ pub(crate) mod device_status { pub const FEATURES_OK: u8 = 0x08; pub const FAILED: u8 = 0x80; } + +/// Virtio PCI common configuration register offsets +/// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1420003 +/// ```c +/// struct virtio_pci_common_config { +/// /* About the whole device. 
*/ +/// le32 device_feature_select; /* read-write */ +/// le32 device_feature; /* read-only for driver */ +/// le32 driver_feature_select; /* read-write */ +/// le32 driver_feature; /* read-write */ +/// le16 msix_config; /* read-write */ +/// le16 num_queues; /* read-only for driver */ +/// u8 device_status; /* read-write */ +/// u8 config_generation; /* read-only for driver */ +/// +/// /* About a specific virtqueue. */ +/// le16 queue_select; /* read-write */ +/// le16 queue_size; /* read-write, power of 2, or 0. */ +/// le16 queue_msix_vector; /* read-write */ +/// le16 queue_enable; /* read-write */ +/// le16 queue_notify_off; /* read-only for driver */ +/// le64 queue_desc; /* read-write */ +/// le64 queue_avail; /* read-write */ +/// le64 queue_used; /* read-write */ +/// }; +/// ``` +pub(crate) mod common_config_offset { + pub const DEVICE_FEATURE_SELECT: u64 = 0x00; + pub const DEVICE_FEATURE: u64 = 0x04; + pub const DRIVER_FEATURE_SELECT: u64 = 0x08; + pub const DRIVER_FEATURE: u64 = 0x0c; + pub const MSIX_CONFIG: u64 = 0x10; + pub const NUM_QUEUES: u64 = 0x12; + pub const DEVICE_STATUS: u64 = 0x14; + pub const CONFIG_GENERATION: u64 = 0x15; + + pub const QUEUE_SELECT: u64 = 0x16; + pub const QUEUE_SIZE: u64 = 0x18; + pub const QUEUE_MSIX_VECTOR: u64 = 0x1a; + pub const QUEUE_ENABLE: u64 = 0x1c; + pub const QUEUE_NOTIFY_OFF: u64 = 0x1e; + pub const QUEUE_DESC_LO: u64 = 0x20; + pub const QUEUE_DESC_HI: u64 = 0x24; + pub const QUEUE_AVAIL_LO: u64 = 0x28; + pub const QUEUE_AVAIL_HI: u64 = 0x2c; + pub const QUEUE_USED_LO: u64 = 0x30; + pub const QUEUE_USED_HI: u64 = 0x34; +} From 7b51f8160a1338b5739baf57da3d35b8ca8b6cfa Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Wed, 18 Mar 2026 14:58:32 +0000 Subject: [PATCH 25/53] fix(pci): Check device status before virtqueue config writes Per the virtio spec [1], the driver initialization sequence requires queue configuration to only be done between FEATURES_OK and DRIVER_OK. 
Introduce update_queue_field() to enforce this constraint, mirroring the guard already present in the virtio MMIO transport. Writes are rejected with a warning if FEATURES_OK is not set or if DRIVER_OK/FAILED is set. Add unit tests verifying writes to queue fields (queue_size, queue_enable, queue_desc, queue_avail, queue_used) are ignored before FEATURES_OK and after DRIVER_OK. [1]: https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 Signed-off-by: Takahiro Itazuri --- .../virtio/transport/pci/common_config.rs | 113 ++++++++++++++++-- 1 file changed, 100 insertions(+), 13 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index 9558ed41814..023da895a49 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -18,6 +18,7 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::transport::pci::common_config_offset::*; use crate::devices::virtio::transport::pci::device::VIRTQ_MSI_NO_VECTOR; +use crate::devices::virtio::transport::pci::device_status::*; use crate::logger::warn; pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; @@ -161,6 +162,25 @@ impl VirtioPciCommonConfig { } } + /// Guard queue configuration field writes based on device status. + /// + /// Per the virtio spec, the driver SHALL follow this sequence: + /// INIT -> ACKNOWLEDGE -> DRIVER -> FEATURES_OK -> DRIVER_OK + /// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 + /// + /// Queue configuration must only be done between FEATURES_OK and DRIVER_OK. 
+ fn update_queue_field(&mut self, queues: &mut [Queue], f: F) { + let status = self.driver_status; + if status == (ACKNOWLEDGE | DRIVER | FEATURES_OK) { + self.with_queue_mut(queues, f); + } else { + warn!( + "pci: queue config write not allowed in device status {:#x}", + status + ); + } + } + fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { match offset { MSIX_CONFIG => { @@ -178,7 +198,7 @@ impl VirtioPciCommonConfig { } } QUEUE_SELECT => self.queue_select = value, - QUEUE_SIZE => self.with_queue_mut(queues, |q| q.size = value), + QUEUE_SIZE => self.update_queue_field(queues, |q| q.size = value), QUEUE_MSIX_VECTOR => { let mut msix_queues = self.msix_queues.lock().expect("Poisoned lock"); let nr_vectors = msix_queues.len() + 1; @@ -195,7 +215,7 @@ impl VirtioPciCommonConfig { } } } - QUEUE_ENABLE => self.with_queue_mut(queues, |q| { + QUEUE_ENABLE => self.update_queue_field(queues, |q| { if value != 0 { q.ready = value == 1; } @@ -290,22 +310,22 @@ impl VirtioPciCommonConfig { DEVICE_FEATURE_SELECT => self.device_feature_select = value, DRIVER_FEATURE_SELECT => self.driver_feature_select = value, DRIVER_FEATURE => locked_device.ack_features_by_page(self.driver_feature_select, value), - QUEUE_DESC_LO => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_DESC_LO => self.update_queue_field(locked_device.queues_mut(), |q| { lo(&mut q.desc_table_address, value) }), - QUEUE_DESC_HI => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_DESC_HI => self.update_queue_field(locked_device.queues_mut(), |q| { hi(&mut q.desc_table_address, value) }), - QUEUE_AVAIL_LO => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_AVAIL_LO => self.update_queue_field(locked_device.queues_mut(), |q| { lo(&mut q.avail_ring_address, value) }), - QUEUE_AVAIL_HI => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_AVAIL_HI => self.update_queue_field(locked_device.queues_mut(), |q| { hi(&mut 
q.avail_ring_address, value) }), - QUEUE_USED_LO => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_USED_LO => self.update_queue_field(locked_device.queues_mut(), |q| { lo(&mut q.used_ring_address, value) }), - QUEUE_USED_HI => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_USED_HI => self.update_queue_field(locked_device.queues_mut(), |q| { hi(&mut q.used_ring_address, value) }), _ => { @@ -378,7 +398,7 @@ mod tests { assert_eq!(read_back[0], 0x55); // Device features is read-only and passed through from the device. - regs.write(DEVICE_FEATURE, &[0, 0, 0, 0], dev.clone()); + regs.write(DEVICE_FEATURE, &[1, 2, 3, 4], dev.clone()); let mut read_back = vec![0, 0, 0, 0]; regs.read(DEVICE_FEATURE, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0u32); @@ -525,10 +545,23 @@ mod tests { max_size[queue_id as usize] = len; } + // Before FEATURES_OK is set, the driver should not be able to change the queue size. + config.driver_status = ACKNOWLEDGE | DRIVER; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SIZE, 0u16.as_slice(), device.clone()); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); + assert_eq!(len, max_size[queue_id as usize]); + } + + // Verify writing a queue size to a non-existent queue is ignored. config.write(QUEUE_SELECT, 2u16.as_slice(), device.clone()); config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, 0); + // Set FEATURES_OK so that the driver can change the queue size. + config.driver_status |= FEATURES_OK; + // Setup size smaller than what is the maximum offered for queue_id in 0u16..2 { config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); @@ -540,6 +573,13 @@ mod tests { config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, max_size[queue_id as usize] - 1); } + + // Verify writes are rejected after DRIVER_OK is set. 
+ config.driver_status |= DRIVER_OK; + config.write(QUEUE_SELECT, 0u16.as_slice(), device.clone()); + config.write(QUEUE_SIZE, 0u16.as_slice(), device.clone()); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); + assert_eq!(len, max_size[0] - 1); } #[test] @@ -574,19 +614,40 @@ mod tests { let device = default_device(); let mut enabled = 0u16; + // Initially queue should be disabled for queue_id in 0u16..2 { config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); + assert_eq!(enabled, 0); + } - // Initially queue should be disabled + // Enabling a queue before FEATURES_OK should be ignored. + config.driver_status = ACKNOWLEDGE | DRIVER; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_ENABLE, 1u16.as_slice(), device.clone()); config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 0); + } - // Enable queue + // Set FEATURES_OK so that the driver can enable the queue. + config.driver_status |= FEATURES_OK; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); config.write(QUEUE_ENABLE, 1u16.as_slice(), device.clone()); config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 1); - // According to the specification "The driver MUST NOT write a 0 to queue_enable." + // The driver MUST NOT write a 0 to queue_enable. 
+ config.write(QUEUE_ENABLE, 0u16.as_slice(), device.clone()); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); + assert_eq!(enabled, 1); + } + + // Verify writes are rejected after DRIVER_OK + config.driver_status |= DRIVER_OK; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); config.write(QUEUE_ENABLE, 0u16.as_slice(), device.clone()); config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 1); @@ -646,8 +707,20 @@ mod tests { fn test_queue_addresses() { let mut config = default_pci_common_config(); let device = default_device(); - let mut reg64bit = 0; + // Before FEATURES_OK is set, the driver should not be able to change the queue addresses. + config.driver_status = ACKNOWLEDGE | DRIVER; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + + for offset in [QUEUE_DESC_LO, QUEUE_AVAIL_LO, QUEUE_USED_LO] { + write_64bit_field(&mut config, device.clone(), offset, 0x0000_1312_0000_1110); + assert_eq!(read_64bit_field(&mut config, device.clone(), offset), 0); + } + } + + // Set status so queue fields can be modified + config.driver_status |= FEATURES_OK; for queue_id in 0u16..2 { config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); @@ -659,6 +732,20 @@ mod tests { ); } } + + // Verify writes are rejected after DRIVER_OK + config.driver_status |= DRIVER_OK; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + + for offset in [QUEUE_DESC_LO, QUEUE_AVAIL_LO, QUEUE_USED_LO] { + write_64bit_field(&mut config, device.clone(), offset, 0xDEAD_BEEF); + assert_eq!( + read_64bit_field(&mut config, device.clone(), offset), + 0x0000_1312_0000_1110 + ); + } + } } #[test] From 5f791c1fcb00e00a8064a9cbcdb1ff042e4da42b Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Fri, 20 Mar 2026 10:27:44 +0000 Subject: [PATCH 26/53] test: Use queue_select instead of device_status for PCI config 
cap test Rewrite test_pci_configuration_cap to use queue_select (offset 0x16) instead of device_status. The test's purpose is to verify the PCI Configuration Access Capability mechanism (indirect BAR access via PCI config space), not device_status behavior. queue_select is freely read-writable without constraints unlike device_status. Signed-off-by: Takahiro Itazuri --- .../devices/virtio/transport/pci/device.rs | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 1109868eb80..c79ffca2e34 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -1373,43 +1373,52 @@ mod tests { let num_queues = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); assert_eq!(num_queues, 1); - // Let's update the driver features and see if that takes effect - let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET + DEVICE_STATUS).unwrap(); - let len = 1u32; - let device_status = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); - assert_eq!(device_status, 0); + // Use queue_select to test read/write through the PCI Configuration Access Capability. + // This register is freely read-writable with no side effects, making it ideal for testing + // the capability mechanism itself. 
+ let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET + QUEUE_SELECT).unwrap(); + let len = 2u32; + let val = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); + assert_eq!(val, 0); + cap_pci_cfg_write( &mut locked_virtio_pci_device, bar_offset, len, - 0x42u32.as_slice(), + 0x01u32.as_slice(), ); - let device_status = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); - assert_eq!(device_status, 0x42); + let val = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); + assert_eq!(val, 0x01); - // reads with out-of-bounds lengths should return 0s + // Reads with out-of-bounds lengths should return 0s assert_eq!( cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, 8), 0 ); - // writes out-of-bounds lengths should have no effect + // Writes with out-of-bounds lengths should have no effect cap_pci_cfg_write( &mut locked_virtio_pci_device, bar_offset, 8, - 0x84u32.as_slice(), + 0xDEADu32.as_slice(), ); assert_eq!( - cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, 1), - 0x42 + cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len), + val ); - // Make sure that we handle properly from/to a BAR where the access length doesn't match - // what we've set in the capability's length + + // When the capability's length is shorter than pci_cfg_data (4 bytes), only that many + // bytes should be forwarded to the BAR write. Writing 0xDEAD_0000 with length=2 should + // only write the lower 2 bytes (0x0000). 
cap_pci_cfg_write( &mut locked_virtio_pci_device, bar_offset, - 2, - 0x42u8.as_slice(), + len, + 0xDEAD_0000u32.as_slice(), + ); + assert_eq!( + cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len), + 0x0000 ); } From 4242cb5bbe3c019fa1fe8292bf128a6ebca1458f Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Wed, 18 Mar 2026 18:13:04 +0000 Subject: [PATCH 27/53] fix(pci): Enforce device status state machine Add set_device_status() to enforce the virtio spec initialization sequence [1]: INIT -> ACKNOWLEDGE -> DRIVER -> FEATURES_OK -> DRIVER_OK The state machine validates that each transition sets exactly one new bit while preserving all previously set bits, per the virtio spec [2] "The driver MUST NOT clear a device status bit." Add a test for valid state transitions and a negative test for invalid transitions. [1]: https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 [2]: https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-120001 Signed-off-by: Takahiro Itazuri --- .../virtio/transport/pci/common_config.rs | 155 +++++++++++++++++- 1 file changed, 146 insertions(+), 9 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index 023da895a49..a329a359fd9 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -127,13 +127,69 @@ impl VirtioPciCommonConfig { fn write_common_config_byte(&mut self, offset: u64, value: u8) { match offset { - DEVICE_STATUS => self.driver_status = value, + DEVICE_STATUS => self.set_device_status(value), _ => { warn!("pci: invalid virtio config byte write: 0x{:x}", offset); } } } + fn set_device_status(&mut self, status: u8) { + /// Enforce the device status state machine per the virtio spec: + /// INIT -> ACKNOWLEDGE -> DRIVER -> FEATURES_OK -> DRIVER_OK + ///
https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 + /// + /// Each step sets exactly one new bit while preserving all previous bits. + const VALID_TRANSITIONS: &[(u8, u8)] = &[ + (INIT, ACKNOWLEDGE), + (ACKNOWLEDGE, ACKNOWLEDGE | DRIVER), + (ACKNOWLEDGE | DRIVER, ACKNOWLEDGE | DRIVER | FEATURES_OK), + ( + ACKNOWLEDGE | DRIVER | FEATURES_OK, + ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK, + ), + ]; + + if (status & FAILED) != 0 { + // Something went wrong in the guest. + // + // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-110001 + // > FAILED (128) + // > Indicates that something went wrong in the guest, and it has given up on the + // > device. + self.driver_status |= FAILED; + } else if status == INIT { + // Reset requested by the driver. + // + // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1430001 + // > The device MUST reset when 0 is written to device_status, and present a 0 in + // > device_status once that is done. + // + // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1440002 + // > After writing 0 to device_status, the driver MUST wait for a read of device_status + // > to return 0 before reinitializing the device. + // + // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-200001 + // > 2.4.1 Device Requirements: Device Reset + // > A device MUST reinitialize device status to 0 after receiving a reset. + // + // Setting INIT (0) here before the actual reset completes in write_bar() may appear + // racy - the driver could read 0 before the device is fully torn down. But concurrent + // access is serialized since VirtioPciDevice is accessed through Arc>. 
+ self.driver_status = INIT; + } else if VALID_TRANSITIONS + .iter() + .any(|&(from, to)| self.driver_status == from && status == to) + { + self.driver_status = status; + } else { + warn!( + "pci: invalid virtio device status transition: {:#x} -> {:#x}", + self.driver_status, status + ); + } + } + fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { match offset { MSIX_CONFIG => self.msix_config.load(Ordering::Acquire), @@ -385,11 +441,6 @@ mod tests { }; let dev = Arc::new(Mutex::new(DummyDevice::new())); - // Can set all bits of driver_status. - regs.write(DEVICE_STATUS, &[0x55], dev.clone()); - let mut read_back = vec![0x00]; - regs.read(DEVICE_STATUS, &mut read_back, dev.clone()); - assert_eq!(read_back[0], 0x55); // The config generation register is read only. regs.write(CONFIG_GENERATION, &[0xaa], dev.clone()); @@ -497,14 +548,100 @@ mod tests { #[test] fn test_device_status() { let mut config = default_pci_common_config(); - let mut device = default_device(); + let device = default_device(); let mut status = 0u8; + // Initial status should be INIT (0) config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); assert_eq!(status, 0); - config.write(DEVICE_STATUS, 0x42u8.as_slice(), device.clone()); + + // Valid state transitions + config.write(DEVICE_STATUS, ACKNOWLEDGE.as_slice(), device.clone()); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, ACKNOWLEDGE); + + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER).as_slice(), + device.clone(), + ); config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); - assert_eq!(status, 0x42); + assert_eq!(status, ACKNOWLEDGE | DRIVER); + + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER | FEATURES_OK).as_slice(), + device.clone(), + ); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, ACKNOWLEDGE | DRIVER | FEATURES_OK); + + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER | 
FEATURES_OK | DRIVER_OK).as_slice(), + device.clone(), + ); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK); + + // Reset should always work + config.write(DEVICE_STATUS, INIT.as_slice(), device.clone()); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, INIT); + } + + #[test] + fn test_device_status_invalid_transitions() { + let mut config = default_pci_common_config(); + let device = default_device(); + + // Helper to attempt a transition and verify it was rejected. + let mut assert_rejected = |config: &mut VirtioPciCommonConfig, new: u8, expected: u8| { + config.write(DEVICE_STATUS, new.as_slice(), device.clone()); + let mut s = 0u8; + config.read(DEVICE_STATUS, s.as_mut_slice(), device.clone()); + assert_eq!(s, expected, "transition to {new:#x} should be rejected"); + }; + + // Check the initial state is INIT (0) + let mut status = 0; + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, INIT); + + // Skip ACKNOWLEDGE: INIT -> ACKNOWLEDGE | DRIVER + assert_rejected(&mut config, ACKNOWLEDGE | DRIVER, INIT); + // Arbitrary value from INIT + assert_rejected(&mut config, 0x42, INIT); + + // Advance to ACKNOWLEDGE | DRIVER | FEATURES_OK + config.write(DEVICE_STATUS, ACKNOWLEDGE.as_slice(), device.clone()); + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER).as_slice(), + device.clone(), + ); + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER | FEATURES_OK).as_slice(), + device.clone(), + ); + let expected = ACKNOWLEDGE | DRIVER | FEATURES_OK; + + // Go back: FEATURES_OK -> DRIVER + assert_rejected(&mut config, ACKNOWLEDGE | DRIVER, expected); + // Valid transition FEATURES_OK -> DRIVER_OK but without cumulative bits + assert_rejected(&mut config, DRIVER_OK, expected); + + // Advance to FEATURES_OK + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER | FEATURES_OK).as_slice(), + 
device.clone(), + ); + let expected = ACKNOWLEDGE | DRIVER | FEATURES_OK; + + // Go back from FEATURES_OK + assert_rejected(&mut config, ACKNOWLEDGE | DRIVER, expected); } #[test] From 5d9bf8b4f91ebfac098f968f25f9257c2743ace9 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Thu, 19 Mar 2026 07:07:07 +0000 Subject: [PATCH 28/53] fix(mmio): Reject missing cumulative bits in device status The match on `!self.device_status & status` only checked which new bit was being set, but did not verify that all previously set bits were preserved in the written value. This allowed transitions like writing bare DRIVER_OK from FEATURES_OK state, which clears ACKNOWLEDGE, DRIVER, and FEATURES_OK bits in violation of the virtio spec [1]: "The driver MUST NOT clear a device status bit." Add a guard ensuring the written value is exactly the current status with one new bit added. [1]: https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-120001 Signed-off-by: Takahiro Itazuri --- src/vmm/src/devices/virtio/transport/mmio.rs | 148 +++++++++++++------ 1 file changed, 102 insertions(+), 46 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index d98dd4ce365..60a436dfdef 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -168,23 +168,51 @@ impl MmioTransport { #[allow(unused_assignments)] fn set_device_status(&mut self, status: u32) { use device_status::*; - // match changed bits - match !self.device_status & status { - ACKNOWLEDGE if self.device_status == INIT => { - self.device_status = status; - } - DRIVER if self.device_status == ACKNOWLEDGE => { - self.device_status = status; + + const VALID_TRANSITIONS: &[(u32, u32)] = &[ + (INIT, ACKNOWLEDGE), + (ACKNOWLEDGE, ACKNOWLEDGE | DRIVER), + (ACKNOWLEDGE | DRIVER, ACKNOWLEDGE | DRIVER | FEATURES_OK), + ( + ACKNOWLEDGE | DRIVER | FEATURES_OK, + ACKNOWLEDGE | DRIVER | FEATURES_OK |
DRIVER_OK, + ), + ]; + + if (status & FAILED) != 0 { + // TODO: notify backend driver to stop the device + self.device_status |= FAILED; + } else if status == INIT { + { + let mut locked_device = self.device.lock().expect("Poisoned lock"); + if locked_device.is_activated() { + let mut device_status = self.device_status; + let reset_result = locked_device.reset(); + match reset_result { + Some((_interrupt_evt, mut _queue_evts)) => {} + None => { + device_status |= FAILED; + } + } + self.device_status = device_status; + } } - FEATURES_OK if self.device_status == (ACKNOWLEDGE | DRIVER) => { - self.device_status = status; + + // If the backend device driver doesn't support reset, + // just leave the device marked as FAILED. + if self.device_status & FAILED == 0 { + self.reset(); } - DRIVER_OK if self.device_status == (ACKNOWLEDGE | DRIVER | FEATURES_OK) => { - self.device_status = status; + } else if VALID_TRANSITIONS + .iter() + .any(|&(from, to)| self.device_status == from && status == to) + { + self.device_status = status; + + // Activate the device when transitioning to DRIVER_OK. 
+ if status == (ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK) { let mut locked_device = self.device.lock().expect("Poisoned lock"); - let device_activated = locked_device.is_activated(); - if !device_activated { - // temporary variable needed for borrow checker + if !locked_device.is_activated() { let activate_result = locked_device.activate(self.mem.clone(), self.interrupt.clone()); if let Err(err) = activate_result { @@ -198,38 +226,11 @@ impl MmioTransport { } } } - _ if (status & FAILED) != 0 => { - // TODO: notify backend driver to stop the device - self.device_status |= FAILED; - } - _ if status == 0 => { - { - let mut locked_device = self.device.lock().expect("Poisoned lock"); - if locked_device.is_activated() { - let mut device_status = self.device_status; - let reset_result = locked_device.reset(); - match reset_result { - Some((_interrupt_evt, mut _queue_evts)) => {} - None => { - device_status |= FAILED; - } - } - self.device_status = device_status; - } - } - - // If the backend device driver doesn't support reset, - // just leave the device marked as FAILED. 
- if self.device_status & FAILED == 0 { - self.reset(); - } - } - _ => { - warn!( - "invalid virtio driver status transition: {:#x} -> {:#x}", - self.device_status, status - ); - } + } else { + warn!( + "invalid virtio driver status transition: {:#x} -> {:#x}", + self.device_status, status + ); } } } @@ -1060,6 +1061,61 @@ pub(crate) mod tests { assert!(d.locked_device().is_activated()); } + fn read_device_status(d: &mut MmioTransport) -> u32 { + let mut buf = [0; 4]; + d.read(0x0, 0x70, &mut buf[..]); + read_le_u32(&buf[..]) + } + + #[test] + fn test_device_status_invalid_transitions() { + let m = single_region_mem(0x1000); + let interrupt: Arc = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); + + let mut assert_rejected = |d: &mut MmioTransport, new: u32, expected: u32| { + set_device_status(d, new); + assert_eq!( + read_device_status(d), + expected, + "transition to {new:#x} should be rejected" + ); + }; + + // Skip ACKNOWLEDGE: INIT -> ACKNOWLEDGE | DRIVER + assert_rejected( + &mut d, + device_status::ACKNOWLEDGE | device_status::DRIVER, + device_status::INIT, + ); + // Arbitrary value from INIT + assert_rejected(&mut d, 0x42, device_status::INIT); + + // Advance to ACKNOWLEDGE | DRIVER | FEATURES_OK + set_device_status(&mut d, device_status::ACKNOWLEDGE); + set_device_status(&mut d, device_status::ACKNOWLEDGE | device_status::DRIVER); + set_device_status( + &mut d, + device_status::ACKNOWLEDGE | device_status::DRIVER | device_status::FEATURES_OK, + ); + let expected = + device_status::ACKNOWLEDGE | device_status::DRIVER | device_status::FEATURES_OK; + + // Go back: FEATURES_OK -> DRIVER + assert_rejected( + &mut d, + device_status::ACKNOWLEDGE | device_status::DRIVER, + expected, + ); + // Valid transition FEATURES_OK -> DRIVER_OK but without cumulative bits + assert_rejected(&mut d, device_status::DRIVER_OK, expected); + } + #[test] fn test_bus_device_reset() { let m = 
single_region_mem(0x1000); From e5bb82722175b2f63e6558e6239337f18b92006f Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Thu, 19 Mar 2026 08:04:20 +0000 Subject: [PATCH 29/53] test(pci): Add realistic guest-side test for queue config immutability We already have Rust unit tests for the immutability of virtio PCI queue config fields (queue_size, queue_enable, queue_desc, queue_avail, queue_used): - test_queue_size - test_queue_enable - test_queue_addresses However, those queue config fields would be of the greatest interest from the security perspective among all the fields. To simulate a more realistic scenario, add an integration test that verifies those fields cannot be modified after boot. Signed-off-by: Takahiro Itazuri --- tests/conftest.py | 12 ++ tests/host_tools/devmem.c | 93 +++++++++++++++ .../integration_tests/functional/test_pci.py | 106 ++++++++++++++++++ 3 files changed, 211 insertions(+) create mode 100644 tests/host_tools/devmem.c diff --git a/tests/conftest.py b/tests/conftest.py index 7c777eb6abd..575e76be272 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -259,6 +259,18 @@ def change_net_config_space_bin(test_fc_session_root_path): yield change_net_config_space_bin +@pytest.fixture(scope="session") +def devmem_bin(test_fc_session_root_path): + """Build a minimal /dev/mem read/write tool.""" + bin_path = os.path.join(test_fc_session_root_path, "devmem") + build_tools.gcc_compile( + "host_tools/devmem.c", + bin_path, + extra_flags="-static", + ) + yield bin_path + + @pytest.fixture(scope="session") def waitpkg_bin(test_fc_session_root_path): """Build a binary that attempts to use WAITPKG (UMONITOR / UMWAIT)""" diff --git a/tests/host_tools/devmem.c b/tests/host_tools/devmem.c new file mode 100644 index 00000000000..49d1fc17438 --- /dev/null +++ b/tests/host_tools/devmem.c @@ -0,0 +1,93 @@ +// Copyright 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 + +// Minimal /dev/mem read/write tool for integration tests. +// +// Usage: +// devmem read +// devmem write +// +// : physical address (hex or decimal) +// : access width in bytes (1, 2, or 4) +// : value to write (hex or decimal, write only) +// +// On read, prints the value as a hex number to stdout. +// On write, writes the value then reads back and prints it. +// Exit code 0 on success, non-zero on failure. + +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc < 4) { + fprintf(stderr, + "Usage: %s read \n" + " %s write \n", + argv[0], argv[0]); + return 1; + } + + int is_write = strcmp(argv[1], "write") == 0; + if (is_write && argc < 5) { + fprintf(stderr, "write mode requires a value argument\n"); + return 1; + } + + uint64_t addr = strtoull(argv[2], NULL, 0); + int width = atoi(argv[3]); + uint64_t value = is_write ? strtoull(argv[4], NULL, 0) : 0; + + if (width != 1 && width != 2 && width != 4) { + fprintf(stderr, "width must be 1, 2, or 4\n"); + return 1; + } + + int fd = open("/dev/mem", O_RDWR | O_SYNC); + if (fd < 0) { + perror("open /dev/mem"); + return 1; + } + + uint64_t page_size = getpagesize(); + uint64_t page_addr = addr & ~(page_size - 1); + uint64_t offset_in_page = addr & (page_size - 1); + uint64_t map_size = page_size; + if (offset_in_page + width > page_size) + map_size *= 2; + + void *map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, page_addr); + if (map == MAP_FAILED) { + perror("mmap"); + close(fd); + return 1; + } + + volatile void *ptr = (volatile char *)map + offset_in_page; + + if (is_write) { + switch (width) { + case 1: *(volatile uint8_t *)ptr = (uint8_t)value; break; + case 2: *(volatile uint16_t *)ptr = (uint16_t)value; break; + case 4: *(volatile uint32_t *)ptr = (uint32_t)value; break; + } + } + + uint32_t result = 0; + switch (width) { + case 1: result = *(volatile uint8_t *)ptr; break; + case 2: 
result = *(volatile uint16_t *)ptr; break; + case 4: result = *(volatile uint32_t *)ptr; break; + } + + printf("0x%x\n", result); + + munmap(map, map_size); + close(fd); + return 0; +} diff --git a/tests/integration_tests/functional/test_pci.py b/tests/integration_tests/functional/test_pci.py index dc0827b1aae..e4e26f4552e 100644 --- a/tests/integration_tests/functional/test_pci.py +++ b/tests/integration_tests/functional/test_pci.py @@ -2,6 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for the PCI devices""" +# Virtio PCI common config register offsets +# https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1420003 +COMMON_CFG_QUEUE_SELECT = 0x16 # u16 +COMMON_CFG_QUEUE_SIZE = 0x18 # u16 +COMMON_CFG_QUEUE_ENABLE = 0x1C # u16 +COMMON_CFG_QUEUE_DESC_LO = 0x20 # u32 +COMMON_CFG_QUEUE_DESC_HI = 0x24 # u32 +COMMON_CFG_QUEUE_AVAIL_LO = 0x28 # u32 +COMMON_CFG_QUEUE_AVAIL_HI = 0x2C # u32 +COMMON_CFG_QUEUE_USED_LO = 0x30 # u32 +COMMON_CFG_QUEUE_USED_HI = 0x34 # u32 + def test_pci_root_present(uvm_any_with_pci): """ @@ -26,3 +38,97 @@ def test_pci_disabled(uvm_any_without_pci): assert ( "00:00.0 Host bridge: Intel Corporation Device" not in stdout ), "PCI root not found in guest" + + +def _find_virtio_blk_bar(vm): + """Find the BAR0 physical address of the first virtio-blk PCI device. + + virtio-blk has PCI device ID 0x1042 (0x1040 + type 2). + + Example:: + + # lspci -n + 00:00.0 0600: 8086:0d57 + 00:01.0 0180: 1af4:1042 (rev 01) + + The resource file has one line per BAR. Each line contains three + space-separated hex values: start, end, flags. 
+ + Example (BAR0 line):: + + # cat /sys/bus/pci/devices/0000:00:01.0/resource | head -1 + 0x0000004000000000 0x000000400007ffff 0x0000000000140204 + """ + stdout = vm.ssh.check_output("lspci -n").stdout.strip() + slot = None + for line in stdout.split("\n"): + parts = line.split() + if len(parts) >= 3 and parts[2] == "1af4:1042": + slot = f"0000:{parts[0]}" + break + assert slot is not None, "No virtio-blk PCI device found" + + cmd = f"cat /sys/bus/pci/devices/{slot}/resource | head -1" + stdout = vm.ssh.check_output(cmd).stdout.strip() + addr = int(stdout.split()[0], 16) + assert addr != 0, f"BAR0 address is 0 for {slot}" + return addr + + +def _devmem_read(vm, tool_path, addr, width): + """Read a physical address via /dev/mem.""" + cmd = f"{tool_path} read 0x{addr:x} {width}" + stdout = vm.ssh.check_output(cmd).stdout.strip() + return int(stdout, 16) + + +def _devmem_write(vm, tool_path, addr, width, value): + """Write a physical address via /dev/mem and return the read-back value.""" + cmd = f"{tool_path} write 0x{addr:x} {width} 0x{value:x}" + stdout = vm.ssh.check_output(cmd).stdout.strip() + return int(stdout, 16) + + +def test_queue_config_immutable(uvm_any_with_pci, devmem_bin): + """ + Test that queue configuration fields cannot be modified by the guest + after the device has been activated (DRIVER_OK is set). + + All PCI common config queue fields are read-write, so we can verify + immutability by writing a poison value and checking the readback still + equals the original. + + MMIO queue config immutability is covered by the Rust unit test + test_queue_config_immutable_after_activation in transport/mmio.rs. + MMIO queue fields are write-only (reads return 0), so integration-level + readback verification via /dev/mem is not possible. 
+ """ + vm = uvm_any_with_pci + + rmt_path = "/tmp/devmem" + vm.ssh.scp_put(devmem_bin, rmt_path) + vm.ssh.check_output(f"chmod +x {rmt_path}") + + bar_addr = _find_virtio_blk_bar(vm) + + # Select queue 0 + _devmem_write(vm, rmt_path, bar_addr + COMMON_CFG_QUEUE_SELECT, 2, 0) + + # (name, offset, width, poison_value) + queue_fields = [ + ("queue_size", COMMON_CFG_QUEUE_SIZE, 2, 0), + ("queue_enable", COMMON_CFG_QUEUE_ENABLE, 2, 0), + ("queue_desc_lo", COMMON_CFG_QUEUE_DESC_LO, 4, 0xDEADBEEF), + ("queue_desc_hi", COMMON_CFG_QUEUE_DESC_HI, 4, 0xDEADBEEF), + ("queue_avail_lo", COMMON_CFG_QUEUE_AVAIL_LO, 4, 0xDEADBEEF), + ("queue_avail_hi", COMMON_CFG_QUEUE_AVAIL_HI, 4, 0xDEADBEEF), + ("queue_used_lo", COMMON_CFG_QUEUE_USED_LO, 4, 0xDEADBEEF), + ("queue_used_hi", COMMON_CFG_QUEUE_USED_HI, 4, 0xDEADBEEF), + ] + for name, offset, width, poison in queue_fields: + addr = bar_addr + offset + orig = _devmem_read(vm, rmt_path, addr, width) + readback = _devmem_write(vm, rmt_path, addr, width, poison) + assert ( + readback == orig + ), f"{name} should remain {orig:#x} after DRIVER_OK, got {readback:#x}" From 5c8b84743434fead4c37a8edf34fc99bf2e98631 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Fri, 20 Mar 2026 11:35:44 +0000 Subject: [PATCH 30/53] test(mmio): Add unit test for queue config immutability Add a Rust unit test that verifies virtio MMIO queue configuration fields (QueueNum, QueueReady, QueueDescLow/High, QueueAvailLow/High, QueueUsedLow/High) cannot be modified after the device has been activated. We do have this test especially for the queue config fields because they would be of the greatest interest from the security perspective among all the fields. Ideally this would be an integration test like the PCI counterpart, but MMIO queue config registers are write-only, making readback verification from the guest impossible.
Signed-off-by: Takahiro Itazuri --- src/vmm/src/devices/virtio/transport/mmio.rs | 71 +++++++++++++++++--- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 60a436dfdef..8dfe17eb5b5 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -958,14 +958,6 @@ pub(crate) mod tests { | device_status::DRIVER_OK ); assert!(d.locked_device().is_activated()); - - // A write which changes the size of a queue after activation; currently only triggers - // a warning path and have no effect on queue state. - write_le_u32(&mut buf[..], 0); - d.queue_select = 0; - d.write(0x0, 0x44, &buf[..]); - d.read(0x0, 0x44, &mut buf[..]); - assert_eq!(read_le_u32(&buf[..]), 1); } #[test] @@ -1175,6 +1167,69 @@ pub(crate) mod tests { assert_eq!(dummy_dev.acked_features(), 24); } + #[test] + fn test_queue_config_immutable_after_activation() { + // Verify that writes to queue configuration fields are rejected after the device has been + // activated (DRIVER_OK). These MMIO registers are write-only (reads return 0), so this + // cannot be tested at the integration level via /dev/mem readback. + let mem = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); + let mut dev = MmioTransport::new( + mem, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); + activate_device(&mut dev); + + dev.queue_select = 0; + + // Save the queue state right after activation. + let size_before = dev.locked_device().queues()[0].size; + let ready_before = dev.locked_device().queues()[0].ready; + let desc_before = dev.locked_device().queues()[0].desc_table_address; + let avail_before = dev.locked_device().queues()[0].avail_ring_address; + let used_before = dev.locked_device().queues()[0].used_ring_address; + + // Attempt to poison every queue config register. 
+ let mut buf = [0u8; 4]; + + // QueueNum (0x38) + write_le_u32(&mut buf, 0); + dev.write(0x0, 0x38, &buf); + assert_eq!(dev.locked_device().queues()[0].size, size_before); + + // QueueReady (0x44) + write_le_u32(&mut buf, 0); + dev.write(0x0, 0x44, &buf); + assert_eq!(dev.locked_device().queues()[0].ready, ready_before); + + // QueueDescLow/High (0x80, 0x84) + write_le_u32(&mut buf, 0xDEADBEEF); + dev.write(0x0, 0x80, &buf); + dev.write(0x0, 0x84, &buf); + assert_eq!( + dev.locked_device().queues()[0].desc_table_address, + desc_before + ); + + // QueueAvailLow/High (0x90, 0x94) + dev.write(0x0, 0x90, &buf); + dev.write(0x0, 0x94, &buf); + assert_eq!( + dev.locked_device().queues()[0].avail_ring_address, + avail_before + ); + + // QueueUsedLow/High (0xa0, 0xa4) + dev.write(0x0, 0xa0, &buf); + dev.write(0x0, 0xa4, &buf); + assert_eq!( + dev.locked_device().queues()[0].used_ring_address, + used_before + ); + } + + #[test] fn irq_trigger() { let irq_trigger = IrqTrigger::new(); From 608d9dd3c0e4500dac55e5af59e19ef9d9f365a1 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Thu, 19 Mar 2026 11:32:15 +0000 Subject: [PATCH 31/53] fix(pci): Block device re-initialization after unsupported reset When the backend virtio device doesn't implement reset(), the PCI transport sets device_status to INIT so that the Linux PCI driver's reset poll terminates correctly. However, the backend device is still active. Without any guard, the driver could do re-initialization against a still-live backend device. Pass device_activated from VirtioPciDevice into set_device_status() through the write() -> write_common_config_byte() chain. When device_activated is true and device_status is already INIT, reject all status transitions. This condition uniquely identifies a post-failed-reset state and naturally survives snapshot restore since both device_activated and device_status are persisted.
Signed-off-by: Takahiro Itazuri --- .../virtio/transport/pci/common_config.rs | 176 +++++++++++++----- .../devices/virtio/transport/pci/device.rs | 82 ++++++-- 2 files changed, 195 insertions(+), 63 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index a329a359fd9..e0fd832bc57 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -95,11 +95,17 @@ impl VirtioPciCommonConfig { } } - pub fn write(&mut self, offset: u64, data: &[u8], device: Arc>) { + pub fn write( + &mut self, + offset: u64, + data: &[u8], + device: Arc>, + device_activated: bool, + ) { assert!(data.len() <= 8); match data.len() { - 1 => self.write_common_config_byte(offset, data[0]), + 1 => self.write_common_config_byte(offset, data[0], device_activated), 2 => self.write_common_config_word( offset, LittleEndian::read_u16(data), @@ -125,16 +131,16 @@ impl VirtioPciCommonConfig { } } - fn write_common_config_byte(&mut self, offset: u64, value: u8) { + fn write_common_config_byte(&mut self, offset: u64, value: u8, device_activated: bool) { match offset { - DEVICE_STATUS => self.set_device_status(value), + DEVICE_STATUS => self.set_device_status(value, device_activated), _ => { warn!("pci: invalid virtio config byte write: 0x{:x}", offset); } } } - fn set_device_status(&mut self, status: u8) { + fn set_device_status(&mut self, status: u8, device_activated: bool) { /// Enforce the device status state machine per the virtio spec: /// INIT -> ACKNOWLEDGE -> DRIVER -> FEATURES_OK -> DRIVER_OK /// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 @@ -181,7 +187,17 @@ impl VirtioPciCommonConfig { .iter() .any(|&(from, to)| self.driver_status == from && status == to) { - self.driver_status = status; + if !device_activated { + self.driver_status = status; + } else { + // If the device doesn't 
implement reset(), the device is left activated. + // Re-initialization against a still-live backend device MUST be rejected. + warn!( + "pci: rejecting device status transition {:#x} -> {:#x}: previous reset did \ + not complete successfully and device is still active", + self.driver_status, status + ); + } } else { warn!( "pci: invalid virtio device status transition: {:#x} -> {:#x}", @@ -443,29 +459,29 @@ mod tests { let dev = Arc::new(Mutex::new(DummyDevice::new())); // The config generation register is read only. - regs.write(CONFIG_GENERATION, &[0xaa], dev.clone()); + regs.write(CONFIG_GENERATION, &[0xaa], dev.clone(), false); let mut read_back = vec![0x00]; regs.read(CONFIG_GENERATION, &mut read_back, dev.clone()); assert_eq!(read_back[0], 0x55); // Device features is read-only and passed through from the device. - regs.write(DEVICE_FEATURE, &[1, 2, 3, 4], dev.clone()); + regs.write(DEVICE_FEATURE, &[1, 2, 3, 4], dev.clone(), false); let mut read_back = vec![0, 0, 0, 0]; regs.read(DEVICE_FEATURE, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0u32); // Feature select registers are read/write. - regs.write(DEVICE_FEATURE_SELECT, &[1, 2, 3, 4], dev.clone()); + regs.write(DEVICE_FEATURE_SELECT, &[1, 2, 3, 4], dev.clone(), false); let mut read_back = vec![0, 0, 0, 0]; regs.read(DEVICE_FEATURE_SELECT, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); - regs.write(DRIVER_FEATURE_SELECT, &[1, 2, 3, 4], dev.clone()); + regs.write(DRIVER_FEATURE_SELECT, &[1, 2, 3, 4], dev.clone(), false); let mut read_back = vec![0, 0, 0, 0]; regs.read(DRIVER_FEATURE_SELECT, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); // 'queue_select' can be read and written. 
- regs.write(QUEUE_SELECT, &[0xaa, 0x55], dev.clone()); + regs.write(QUEUE_SELECT, &[0xaa, 0x55], dev.clone(), false); let mut read_back = vec![0x00, 0x00]; regs.read(QUEUE_SELECT, &mut read_back, dev.clone()); assert_eq!(read_back[0], 0xaa); @@ -477,12 +493,12 @@ mod tests { assert_eq!(read_back, [0xff, 0xff]); // Writing the MSI vector of an invalid `queue_select` does not have any effect. - regs.write(QUEUE_MSIX_VECTOR, &[0x12, 0x13], dev.clone()); + regs.write(QUEUE_MSIX_VECTOR, &[0x12, 0x13], dev.clone(), false); assert_eq!(read_back, [0xff, 0xff]); // Valid `queue_select` though should setup the corresponding MSI-X queue. - regs.write(QUEUE_SELECT, &[0x1, 0x0], dev.clone()); + regs.write(QUEUE_SELECT, &[0x1, 0x0], dev.clone(), false); assert_eq!(regs.queue_select, 1); - regs.write(QUEUE_MSIX_VECTOR, &[0x1, 0x0], dev.clone()); + regs.write(QUEUE_MSIX_VECTOR, &[0x1, 0x0], dev.clone(), false); regs.read(QUEUE_MSIX_VECTOR, &mut read_back, dev); assert_eq!(LittleEndian::read_u16(&read_back[..2]), 0x1); } @@ -501,11 +517,21 @@ mod tests { config.read(DEVICE_FEATURE, features.as_mut_slice(), device.clone()); assert_eq!(features, 0x1110); // select second page - config.write(DEVICE_FEATURE_SELECT, 1u32.as_slice(), device.clone()); + config.write( + DEVICE_FEATURE_SELECT, + 1u32.as_slice(), + device.clone(), + false, + ); config.read(DEVICE_FEATURE, features.as_mut_slice(), device.clone()); assert_eq!(features, 0x1312); // Try a third page. 
It doesn't exist so we should get all 0s - config.write(DEVICE_FEATURE_SELECT, 2u32.as_slice(), device.clone()); + config.write( + DEVICE_FEATURE_SELECT, + 2u32.as_slice(), + device.clone(), + false, + ); config.read(DEVICE_FEATURE, features.as_mut_slice(), device.clone()); assert_eq!(features, 0x0); } @@ -520,11 +546,21 @@ mod tests { .set_avail_features(0x0000_1312_0000_1110); // ACK some features of the first page - config.write(DRIVER_FEATURE, 0x1100u32.as_slice(), device.clone()); + config.write(DRIVER_FEATURE, 0x1100u32.as_slice(), device.clone(), false); assert_eq!(device.lock().unwrap().acked_features(), 0x1100); // ACK some features of the second page - config.write(DRIVER_FEATURE_SELECT, 1u32.as_slice(), device.clone()); - config.write(DRIVER_FEATURE, 0x0000_1310u32.as_slice(), device.clone()); + config.write( + DRIVER_FEATURE_SELECT, + 1u32.as_slice(), + device.clone(), + false, + ); + config.write( + DRIVER_FEATURE, + 0x0000_1310u32.as_slice(), + device.clone(), + false, + ); assert_eq!( device.lock().unwrap().acked_features(), 0x0000_1310_0000_1100 @@ -540,7 +576,7 @@ mod tests { config.read(NUM_QUEUES, num_queues.as_mut_slice(), device.clone()); assert_eq!(num_queues, 2); // `num_queues` is read-only - config.write(NUM_QUEUES, 4u16.as_slice(), device.clone()); + config.write(NUM_QUEUES, 4u16.as_slice(), device.clone(), false); config.read(NUM_QUEUES, num_queues.as_mut_slice(), device.clone()); assert_eq!(num_queues, 2); } @@ -556,7 +592,7 @@ mod tests { assert_eq!(status, 0); // Valid state transitions - config.write(DEVICE_STATUS, ACKNOWLEDGE.as_slice(), device.clone()); + config.write(DEVICE_STATUS, ACKNOWLEDGE.as_slice(), device.clone(), false); config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); assert_eq!(status, ACKNOWLEDGE); @@ -564,6 +600,7 @@ mod tests { DEVICE_STATUS, (ACKNOWLEDGE | DRIVER).as_slice(), device.clone(), + false, ); config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); assert_eq!(status, ACKNOWLEDGE 
| DRIVER); @@ -572,6 +609,7 @@ mod tests { DEVICE_STATUS, (ACKNOWLEDGE | DRIVER | FEATURES_OK).as_slice(), device.clone(), + false, ); config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); assert_eq!(status, ACKNOWLEDGE | DRIVER | FEATURES_OK); @@ -580,12 +618,13 @@ mod tests { DEVICE_STATUS, (ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK).as_slice(), device.clone(), + false, ); config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); assert_eq!(status, ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK); // Reset should always work - config.write(DEVICE_STATUS, INIT.as_slice(), device.clone()); + config.write(DEVICE_STATUS, INIT.as_slice(), device.clone(), true); config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); assert_eq!(status, INIT); } @@ -597,7 +636,7 @@ mod tests { // Helper to attempt a transition and verify it was rejected. let mut assert_rejected = |config: &mut VirtioPciCommonConfig, new: u8, expected: u8| { - config.write(DEVICE_STATUS, new.as_slice(), device.clone()); + config.write(DEVICE_STATUS, new.as_slice(), device.clone(), false); let mut s = 0u8; config.read(DEVICE_STATUS, s.as_mut_slice(), device.clone()); assert_eq!(s, expected, "transition to {new:#x} should be rejected"); @@ -614,16 +653,18 @@ mod tests { assert_rejected(&mut config, 0x42, INIT); // Advance to ACKNOWLEDGE | DRIVER | FEATURES_OK - config.write(DEVICE_STATUS, ACKNOWLEDGE.as_slice(), device.clone()); + config.write(DEVICE_STATUS, ACKNOWLEDGE.as_slice(), device.clone(), false); config.write( DEVICE_STATUS, (ACKNOWLEDGE | DRIVER).as_slice(), device.clone(), + false, ); config.write( DEVICE_STATUS, (ACKNOWLEDGE | DRIVER | FEATURES_OK).as_slice(), device.clone(), + false, ); let expected = ACKNOWLEDGE | DRIVER | FEATURES_OK; @@ -637,6 +678,7 @@ mod tests { DEVICE_STATUS, (ACKNOWLEDGE | DRIVER | FEATURES_OK).as_slice(), device.clone(), + false, ); let expected = ACKNOWLEDGE | DRIVER | FEATURES_OK; @@ -644,6 +686,32 @@ mod tests { 
assert_rejected(&mut config, ACKNOWLEDGE | DRIVER, expected); } + #[test] + fn test_device_activated_blocks_transitions() { + let mut config = default_pci_common_config(); + let device = default_device(); + let mut status = 0u8; + + // Simulate a failed reset: driver_status is INIT but device is still activated. + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, INIT); + + // Every transition should be rejected when device_activated is true at INIT. + for &value in &[ + ACKNOWLEDGE, + ACKNOWLEDGE | DRIVER, + ACKNOWLEDGE | DRIVER | FEATURES_OK, + ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK, + ] { + config.write(DEVICE_STATUS, value.as_slice(), device.clone(), true); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!( + status, INIT, + "transition to {value:#x} should be blocked while device is activated" + ); + } + } + #[test] fn test_config_msix_vector() { let mut config = default_pci_common_config(); @@ -653,13 +721,13 @@ mod tests { // Our device has 2 queues, so we should be using 3 vectors in total. // Trying to set a vector bigger than that should fail. Observing the // failure happens through a subsequent read that should return NO_VECTOR. 
- config.write(MSIX_CONFIG, 3u16.as_slice(), device.clone()); + config.write(MSIX_CONFIG, 3u16.as_slice(), device.clone(), false); config.read(MSIX_CONFIG, vector.as_mut_slice(), device.clone()); assert_eq!(vector, VIRTQ_MSI_NO_VECTOR); // Any of the 3 valid values should work for i in 0u16..3 { - config.write(MSIX_CONFIG, i.as_slice(), device.clone()); + config.write(MSIX_CONFIG, i.as_slice(), device.clone(), false); config.read(MSIX_CONFIG, vector.as_mut_slice(), device.clone()); assert_eq!(vector, i); } @@ -673,7 +741,7 @@ mod tests { let mut max_size = [0u16; 2]; for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!( len, @@ -685,14 +753,14 @@ mod tests { // Before FEATURES_OK is set, the driver should not be able to change the queue size. config.driver_status = ACKNOWLEDGE | DRIVER; for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); - config.write(QUEUE_SIZE, 0u16.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.write(QUEUE_SIZE, 0u16.as_slice(), device.clone(), false); config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, max_size[queue_id as usize]); } // Verify writing a queue size to a non-existent queue is ignored. 
- config.write(QUEUE_SELECT, 2u16.as_slice(), device.clone()); + config.write(QUEUE_SELECT, 2u16.as_slice(), device.clone(), false); config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, 0); @@ -701,11 +769,12 @@ mod tests { // Setup size smaller than what is the maximum offered for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); config.write( QUEUE_SIZE, (max_size[queue_id as usize] - 1).as_slice(), device.clone(), + false, ); config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, max_size[queue_id as usize] - 1); @@ -713,8 +782,8 @@ mod tests { // Verify writes are rejected after DRIVER_OK is set. config.driver_status |= DRIVER_OK; - config.write(QUEUE_SELECT, 0u16.as_slice(), device.clone()); - config.write(QUEUE_SIZE, 0u16.as_slice(), device.clone()); + config.write(QUEUE_SELECT, 0u16.as_slice(), device.clone(), false); + config.write(QUEUE_SIZE, 0u16.as_slice(), device.clone(), false); config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, max_size[0] - 1); } @@ -730,15 +799,20 @@ mod tests { // failure happens through a subsequent read that should return NO_VECTOR. 
for queue_id in 0u16..2 { // Select queue - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); - config.write(QUEUE_MSIX_VECTOR, 3u16.as_slice(), device.clone()); + config.write(QUEUE_MSIX_VECTOR, 3u16.as_slice(), device.clone(), false); config.read(QUEUE_MSIX_VECTOR, vector.as_mut_slice(), device.clone()); assert_eq!(vector, VIRTQ_MSI_NO_VECTOR); // Any of the 3 valid values should work for vector_id in 0u16..3 { - config.write(QUEUE_MSIX_VECTOR, vector_id.as_slice(), device.clone()); + config.write( + QUEUE_MSIX_VECTOR, + vector_id.as_slice(), + device.clone(), + false, + ); config.read(QUEUE_MSIX_VECTOR, vector.as_mut_slice(), device.clone()); assert_eq!(vector, vector_id); } @@ -753,7 +827,7 @@ mod tests { // Initially queue should be disabled for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 0); } @@ -761,8 +835,8 @@ mod tests { // Enabling a queue before FEATURES_OK should be ignored. config.driver_status = ACKNOWLEDGE | DRIVER; for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); - config.write(QUEUE_ENABLE, 1u16.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.write(QUEUE_ENABLE, 1u16.as_slice(), device.clone(), false); config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 0); } @@ -770,13 +844,13 @@ mod tests { // Set FEATURES_OK so that the driver can enable the queue. 
config.driver_status |= FEATURES_OK; for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); - config.write(QUEUE_ENABLE, 1u16.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.write(QUEUE_ENABLE, 1u16.as_slice(), device.clone(), false); config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 1); // The driver MUST NOT write a 0 to queue_enable. - config.write(QUEUE_ENABLE, 0u16.as_slice(), device.clone()); + config.write(QUEUE_ENABLE, 0u16.as_slice(), device.clone(), false); config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 1); } @@ -784,8 +858,8 @@ mod tests { // Verify writes are rejected after DRIVER_OK config.driver_status |= DRIVER_OK; for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); - config.write(QUEUE_ENABLE, 0u16.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.write(QUEUE_ENABLE, 0u16.as_slice(), device.clone(), false); config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 1); } @@ -802,12 +876,12 @@ mod tests { // a field setup by the device and should be read-only for the driver for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); config.read(QUEUE_NOTIFY_OFF, offset.as_mut_slice(), device.clone()); assert_eq!(offset, queue_id); // Writing to it should not have any effect - config.write(QUEUE_NOTIFY_OFF, 0x42.as_slice(), device.clone()); + config.write(QUEUE_NOTIFY_OFF, 0x42.as_slice(), device.clone(), false); config.read(QUEUE_NOTIFY_OFF, offset.as_mut_slice(), device.clone()); assert_eq!(offset, queue_id); } @@ -822,8 +896,8 @@ mod tests { let lo32 = (value & 0xffff_ffff) as u32; let hi32 = (value >> 32) as u32; - config.write(offset, 
lo32.as_slice(), device.clone()); - config.write(offset + 4, hi32.as_slice(), device.clone()); + config.write(offset, lo32.as_slice(), device.clone(), false); + config.write(offset + 4, hi32.as_slice(), device.clone(), false); } fn read_64bit_field( @@ -848,7 +922,7 @@ mod tests { // Before FEATURES_OK is set, the driver should not be able to change the queue addresses. config.driver_status = ACKNOWLEDGE | DRIVER; for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); for offset in [QUEUE_DESC_LO, QUEUE_AVAIL_LO, QUEUE_USED_LO] { write_64bit_field(&mut config, device.clone(), offset, 0x0000_1312_0000_1110); @@ -859,7 +933,7 @@ mod tests { // Set status so queue fields can be modified config.driver_status |= FEATURES_OK; for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); for offset in [QUEUE_DESC_LO, QUEUE_AVAIL_LO, QUEUE_USED_LO] { write_64bit_field(&mut config, device.clone(), offset, 0x0000_1312_0000_1110); @@ -873,7 +947,7 @@ mod tests { // Verify writes are rejected after DRIVER_OK config.driver_status |= DRIVER_OK; for queue_id in 0u16..2 { - config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); for offset in [QUEUE_DESC_LO, QUEUE_AVAIL_LO, QUEUE_USED_LO] { write_64bit_field(&mut config, device.clone(), offset, 0xDEAD_BEEF); diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index c79ffca2e34..a7c78a1808d 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -434,7 +434,7 @@ impl VirtioPciDevice { vectors, )); - let virtio_pci_device = VirtioPciDevice { + let mut virtio_pci_device = VirtioPciDevice { id, pci_device_bdf: 
state.pci_device_bdf, configuration: pci_config, @@ -835,10 +835,12 @@ impl PciDevice for VirtioPciDevice { fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { match offset { - o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { - self.common_config - .write(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) - } + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write( + o - COMMON_CONFIG_BAR_OFFSET, + data, + self.device.clone(), + self.device_activated.load(Ordering::SeqCst), + ), o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { // We don't actually support legacy INT#x interrupts for VirtIO PCI devices warn!("pci: access to unsupported ISR status field"); @@ -919,9 +921,21 @@ impl PciDevice for VirtioPciDevice { } None => { error!("Attempt to reset device when not implemented in underlying device"); - // TODO: currently we don't support device resetting, but we still - // follow the spec and set the status field to 0. - self.common_config.driver_status = INIT; + // The virtio spec does not specify what to do if reset fails. + // + // Our MMIO transport sets FAILED in this case, but we must NOT do that for PCI. + // During shutdown, the Linux kernel issues a reset to each virtio device. The + // virtio PCI driver then polls device_status until it reads back 0, unlike the + // virtio MMIO driver which simply writes 0 and returns. Setting FAILED would + // cause the poll to spin forever, breaking reboot command and Ctrl-Alt-Del. + // - PCI: https://elixir.bootlin.com/linux/v6.19.8/source/drivers/virtio/virtio_pci_modern.c#L546-L565 + // - MMIO: https://elixir.bootlin.com/linux/v6.19.8/source/drivers/virtio/virtio_mmio.c#L251-L258 + // + // Since device_status was already set to INIT by set_device_status(), we don't + // need to set it again here. However, the backend device is still active since + // reset() is unimplemented. 
The combination of device_activated == true and + // device_status == INIT will cause set_device_status() to block any + // re-initialization attempts. } } } @@ -941,6 +955,7 @@ impl BusDevice for VirtioPciDevice { #[cfg(test)] mod tests { + use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; use event_manager::MutEventSubscriber; @@ -1582,7 +1597,7 @@ mod tests { assert!( !locked_virtio_pci_device .device_activated - .load(std::sync::atomic::Ordering::SeqCst) + .load(Ordering::SeqCst) ); write_driver_status(&mut locked_virtio_pci_device, ACKNOWLEDGE); @@ -1592,7 +1607,7 @@ mod tests { assert!( !locked_virtio_pci_device .device_activated - .load(std::sync::atomic::Ordering::SeqCst) + .load(Ordering::SeqCst) ); let status = read_driver_status(&mut locked_virtio_pci_device); @@ -1615,7 +1630,7 @@ mod tests { assert!( !locked_virtio_pci_device .device_activated - .load(std::sync::atomic::Ordering::SeqCst) + .load(Ordering::SeqCst) ); setup_queues(&mut locked_virtio_pci_device); @@ -1630,7 +1645,50 @@ mod tests { assert!( locked_virtio_pci_device .device_activated - .load(std::sync::atomic::Ordering::SeqCst) + .load(Ordering::SeqCst) ); } + + #[test] + fn test_failed_reset_blocks_reinitialization() { + let mut vmm = create_vmm_with_virtio_pci_device(); + let device = get_virtio_device(&vmm); + let mut locked = device.lock().unwrap(); + + // Full initialization sequence. + write_driver_status(&mut locked, ACKNOWLEDGE); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER); + let features = read_device_features(&mut locked); + write_driver_features(&mut locked, features); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER | FEATURES_OK); + setup_queues(&mut locked); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK); + assert!(locked.device_activated.load(Ordering::SeqCst)); + + // Write 0 to device_status to request a reset. + // Entropy's reset() returns None (unimplemented), so the reset fails. 
+ write_driver_status(&mut locked, 0); + assert_eq!(read_driver_status(&mut locked), 0); + // device_activated stays true because the backend was not actually reset. + assert!(locked.device_activated.load(Ordering::SeqCst)); + + // Attempt to re-initialize should be rejected because device_activated is + // still true while driver_status is INIT. + write_driver_status(&mut locked, ACKNOWLEDGE); + assert_eq!(read_driver_status(&mut locked), 0); + + // Save state and restore into a new device -- the combination of + // device_activated == true and driver_status == INIT is preserved in the + // snapshot, so the blocking behavior survives restore. + let saved_state = locked.state(); + drop(locked); + + let new_entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + let restored = + VirtioPciDevice::new_from_state("rng".to_string(), &vmm.vm, new_entropy, saved_state) + .unwrap(); + + assert!(restored.device_activated.load(Ordering::SeqCst)); + assert_eq!(restored.common_config.driver_status, 0); + } } From 92e61faa2f77326d258a450d4d4cfde84c97ca8e Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Thu, 19 Mar 2026 12:41:42 +0000 Subject: [PATCH 32/53] fix(pci): Set DEVICE_NEEDS_RESET on activation failure The MMIO transport sets DEVICE_NEEDS_RESET [1] when device activation fails, but the PCI transport didn't. 
[1]: https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 Signed-off-by: Takahiro Itazuri --- .../devices/virtio/transport/pci/device.rs | 24 +++++++++++++++++++ .../src/devices/virtio/transport/pci/mod.rs | 1 + 2 files changed, 25 insertions(+) diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index a7c78a1808d..5716b321fec 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -890,6 +890,7 @@ impl PciDevice for VirtioPciDevice { { Ok(()) => self.device_activated.store(true, Ordering::SeqCst), Err(err) => { + self.common_config.driver_status |= DEVICE_NEEDS_RESET; error!("Error activating device: {err:?}"); // Section 2.1.2 of the specification states that we need to send a device @@ -1649,6 +1650,29 @@ mod tests { ); } + #[test] + fn test_activate_failure_sets_needs_reset() { + // Verify that DEVICE_NEEDS_RESET is set in driver_status when device activation fails. + use crate::devices::virtio::transport::pci::device_status::DEVICE_NEEDS_RESET; + + let mut vmm = create_vmm_with_virtio_pci_device(); + let device = get_virtio_device(&vmm); + let mut locked = device.lock().unwrap(); + + // Drive through init without setting up queues, so activate() fails. + write_driver_status(&mut locked, ACKNOWLEDGE); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER); + let features = read_device_features(&mut locked); + write_driver_features(&mut locked, features); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER | FEATURES_OK); + // Skip setup_queues() -- queues are not ready, so activate() will fail. 
+ write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK); + + assert!(!locked.device_activated.load(Ordering::SeqCst)); + let status = read_driver_status(&mut locked); + assert_eq!(status & DEVICE_NEEDS_RESET, DEVICE_NEEDS_RESET); + } + #[test] fn test_failed_reset_blocks_reinitialization() { let mut vmm = create_vmm_with_virtio_pci_device(); diff --git a/src/vmm/src/devices/virtio/transport/pci/mod.rs b/src/vmm/src/devices/virtio/transport/pci/mod.rs index a5f9f54bcd8..fba4b8faaa8 100644 --- a/src/vmm/src/devices/virtio/transport/pci/mod.rs +++ b/src/vmm/src/devices/virtio/transport/pci/mod.rs @@ -15,6 +15,7 @@ pub(crate) mod device_status { pub const DRIVER: u8 = 0x02; pub const DRIVER_OK: u8 = 0x04; pub const FEATURES_OK: u8 = 0x08; + pub const DEVICE_NEEDS_RESET: u8 = 0x40; pub const FAILED: u8 = 0x80; } From 1152982306be074c941799cada0ad37837213aaf Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Thu, 19 Mar 2026 13:00:22 +0000 Subject: [PATCH 33/53] fix(pci): Check device status for feature negotiation The MMIO transport rejects writes to the driver feature register outside of the DRIVER state, but the PCI transport accepted them unconditionally. Add the same guard: only call ack_features_by_page() when DRIVER is set and FEATURES_OK, FAILED, and DEVICE_NEEDS_RESET are all clear.
Signed-off-by: Takahiro Itazuri --- .../virtio/transport/pci/common_config.rs | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index e0fd832bc57..b5ee2a2fed4 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -381,7 +381,18 @@ impl VirtioPciCommonConfig { match offset { DEVICE_FEATURE_SELECT => self.device_feature_select = value, DRIVER_FEATURE_SELECT => self.driver_feature_select = value, - DRIVER_FEATURE => locked_device.ack_features_by_page(self.driver_feature_select, value), + DRIVER_FEATURE => { + // Feature negotiation is only allowed in DRIVER state. + // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 + if self.driver_status == (ACKNOWLEDGE | DRIVER) { + locked_device.ack_features_by_page(self.driver_feature_select, value); + } else { + warn!( + "pci: feature negotiation not allowed in device state {:#x}", + self.driver_status + ); + } + } QUEUE_DESC_LO => self.update_queue_field(locked_device.queues_mut(), |q| { lo(&mut q.desc_table_address, value) }), @@ -545,6 +556,10 @@ mod tests { .unwrap() .set_avail_features(0x0000_1312_0000_1110); + // Feature negotiation requires DRIVER state (ACKNOWLEDGE | DRIVER). + config.set_device_status(ACKNOWLEDGE, false); + config.set_device_status(ACKNOWLEDGE | DRIVER, false); + // ACK some features of the first page config.write(DRIVER_FEATURE, 0x1100u32.as_slice(), device.clone(), false); assert_eq!(device.lock().unwrap().acked_features(), 0x1100); @@ -565,6 +580,25 @@ mod tests { device.lock().unwrap().acked_features(), 0x0000_1310_0000_1100 ); + + // After FEATURES_OK, further feature writes should be rejected. 
+ config.set_device_status(ACKNOWLEDGE | DRIVER | FEATURES_OK, false); + config.write( + DRIVER_FEATURE_SELECT, + 0u32.as_slice(), + device.clone(), + false, + ); + config.write( + DRIVER_FEATURE, + 0xFFFF_FFFFu32.as_slice(), + device.clone(), + false, + ); + assert_eq!( + device.lock().unwrap().acked_features(), + 0x0000_1310_0000_1100 + ); } #[test] From a213b3d1c965642b2eb6ef3aaa78420623a2593a Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Fri, 20 Mar 2026 05:57:46 +0000 Subject: [PATCH 34/53] docs: Add CHANGELOG entries for virtio transport fixes Add entries for the virtio transport fixes introduced in the preceding commits. Signed-off-by: Takahiro Itazuri --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bd13118b81..4378e67552a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,16 @@ and this project adheres to a descriptor chain that caused Firecracker to allocate more host memory than the guest actually provided, potentially leading to excessive host memory consumption. +- [#XXXX](https://github.com/firecracker-microvm/firecracker/pull/XXXX): Enforce + the virtio device initialization sequence in the PCI transport, matching the + existing MMIO transport behavior. The PCI transport now validates device + status transitions, rejects queue configuration writes outside the FEATURES_OK + to DRIVER_OK window, rejects feature negotiation outside the DRIVER state, + blocks re-initialization after a failed reset, and sets DEVICE_NEEDS_RESET + when device activation fails. +- [#XXXX](https://github.com/firecracker-microvm/firecracker/pull/XXXX): Reject + device status writes that clear previously set bits in the MMIO transport, + except for reset. 
## [1.14.3] From 1dd26ef8ac9c65e6e2fe802510078e68595aaf61 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 20 Mar 2026 16:26:13 +0000 Subject: [PATCH 35/53] fix(aarch64): override fabricated CLIDR_EL1 to match host cache topology Since host kernel 6.3 (commit 7af0c2534f4c), KVM fabricates CLIDR_EL1 instead of passing through the host's real value. On hosts with IDC=1 and DIC=0 (e.g. Neoverse V1), the fabricated CLIDR exposes only L1=Unified when the host actually has separate L1d+L1i, L2, and L3. Guest kernels >= 6.1.156 backported init_of_cache_level() which counts cache leaves from the DT, while populate_cache_leaves() uses CLIDR_EL1. When the DT (built from host sysfs) describes more cache entries than CLIDR_EL1, the mismatch causes cache sysfs entries to not be created, breaking /sys/devices/system/cpu/cpu*/cache/* in the guest. Fix this by reading the current CLIDR_EL1 from vCPU 0, merging in the ctype and LoC fields derived from the host's sysfs cache topology, and writing the result back to each vCPU via KVM_SET_ONE_REG. Fields that cannot be derived from sysfs (LoUU, LoUIS, ICB, Ttype) are preserved from the original CLIDR_EL1. This makes CLIDR_EL1 consistent with the FDT, which already describes the real host caches. On pre-6.3 kernels, KVM passes through the real host CLIDR rather than fabricating one. Since the sysfs cache topology already matches the real CLIDR, the merge produces the same value, the write is skipped, and the override is effectively a no-op. This approach preserves the full host cache information for the guest rather than stripping the FDT to match the fabricated CLIDR. 
Signed-off-by: Nikita Kalyazin --- CHANGELOG.md | 3 + src/vmm/src/arch/aarch64/cache_info.rs | 202 ++++++++++++++++++++++++- src/vmm/src/arch/aarch64/mod.rs | 73 +++++++++ src/vmm/src/arch/aarch64/regs.rs | 4 + 4 files changed, 279 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4378e67552a..3832542288d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,9 @@ and this project adheres to - [#XXXX](https://github.com/firecracker-microvm/firecracker/pull/XXXX): Reject device status writes that clear previously set bits in the MMIO transport, except for reset. +- [#5780](https://github.com/firecracker-microvm/firecracker/pull/5780): Fixed + missing `/sys/devices/system/cpu/cpu*/cache/*` in aarch64 guests when running + on host kernels >= 6.3 with guest kernels >= 6.1.156. ## [1.14.3] diff --git a/src/vmm/src/arch/aarch64/cache_info.rs b/src/vmm/src/arch/aarch64/cache_info.rs index 8f8611fe440..4c934626f1b 100644 --- a/src/vmm/src/arch/aarch64/cache_info.rs +++ b/src/vmm/src/arch/aarch64/cache_info.rs @@ -10,7 +10,7 @@ use crate::logger::warn; const MAX_CACHE_LEVEL: u8 = 7; #[derive(Debug, thiserror::Error, displaydoc::Display)] -pub(crate) enum CacheInfoError { +pub enum CacheInfoError { /// Failed to read cache information: {0} FailedToReadCacheInfo(#[from] io::Error), /// Invalid cache configuration found for {0}: {1} @@ -32,7 +32,7 @@ trait CacheStore: std::fmt::Debug { } #[derive(Debug)] -pub(crate) struct CacheEntry { +pub struct CacheEntry { // Cache Level: 1, 2, 3.. pub level: u8, // Type of cache: Unified, Data, Instruction. @@ -154,7 +154,7 @@ impl Default for CacheEntry { #[derive(Debug)] // Based on https://elixir.free-electrons.com/linux/v4.9.62/source/include/linux/cacheinfo.h#L11. 
-pub(crate) enum CacheType { +pub enum CacheType { Instruction, Data, Unified, @@ -314,6 +314,105 @@ pub(crate) fn read_cache_config( Ok(()) } +// CLIDR_EL1 field positions +// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/CLIDR-EL1--Cache-Level-ID-Register +const CLIDR_CTYPE_SHIFT: u8 = 3; // Each Ctype field is 3 bits +const CLIDR_LOC_SHIFT: u8 = 24; + +// CLIDR_EL1 Ctype field values +const CLIDR_CTYPE_NO_CACHE: u64 = 0; +const CLIDR_CTYPE_INSTRUCTION: u64 = 1; +const CLIDR_CTYPE_DATA: u64 = 2; +const CLIDR_CTYPE_SEPARATE: u64 = 3; +const CLIDR_CTYPE_UNIFIED: u64 = 4; + +/// Classify a set of cache entries at the same level into a CLIDR Ctype value. +fn ctype_for_entries<'a>(entries: impl Iterator) -> u64 { + let (mut has_data, mut has_inst, mut has_unified) = (false, false, false); + let mut any = false; + for c in entries { + any = true; + match c.type_ { + CacheType::Data => has_data = true, + CacheType::Instruction => has_inst = true, + CacheType::Unified => has_unified = true, + } + } + if !any { + return CLIDR_CTYPE_NO_CACHE; + } + if has_unified { + CLIDR_CTYPE_UNIFIED + } else if has_data && has_inst { + CLIDR_CTYPE_SEPARATE + } else if has_data { + CLIDR_CTYPE_DATA + } else if has_inst { + CLIDR_CTYPE_INSTRUCTION + } else { + CLIDR_CTYPE_NO_CACHE + } +} + +/// Build a CLIDR_EL1 value from the host's cache topology read from sysfs. +/// +/// Since host kernel 6.3 (commit 7af0c2534f4c), KVM fabricates CLIDR_EL1 to +/// expose a different cache topology than the host. Guest kernels >= 6.1.156 +/// backported `init_of_cache_level()` which counts cache leaves from the DT, +/// while `populate_cache_leaves()` uses CLIDR_EL1. If the DT (built from +/// sysfs) describes different cache entries than CLIDR_EL1, the mismatch +/// causes cache sysfs entries to not be created in the guest. 
+/// +/// This function builds a CLIDR_EL1 value that matches the host's real cache +/// topology so it can be written to each vCPU, making CLIDR_EL1 consistent +/// with the FDT. +pub(crate) fn build_clidr_from_caches( + l1_caches: &[CacheEntry], + non_l1_caches: &[CacheEntry], +) -> u64 { + let mut clidr: u64 = 0; + let mut max_level: u8 = 0; + + let l1_ctype = ctype_for_entries(l1_caches.iter()); + if l1_ctype != CLIDR_CTYPE_NO_CACHE { + clidr |= l1_ctype; + max_level = 1; + } + + for level in 2..=MAX_CACHE_LEVEL { + let ctype = ctype_for_entries(non_l1_caches.iter().filter(|c| c.level == level)); + if ctype == CLIDR_CTYPE_NO_CACHE { + break; + } + + let shift = CLIDR_CTYPE_SHIFT * (level - 1); + clidr |= ctype << shift; + max_level = level; + } + + // Set LoC (Level of Coherence) to the highest cache level + clidr |= u64::from(max_level) << CLIDR_LOC_SHIFT; + + clidr +} + +/// Merge sysfs-derived ctype/LoC fields into an existing CLIDR_EL1 value, +/// preserving LoUU, LoUIS, ICB, and Ttype fields from the original. +/// +/// This ensures that on pre-6.3 kernels (where CLIDR already matches sysfs), +/// the write is effectively a no-op, and fields we can't derive from sysfs +/// (like LoUU, LoUIS, ICB) are never clobbered. +pub(crate) fn merge_clidr(current: u64, sysfs: u64) -> u64 { + // Ctype fields: bits [20:0] (7 levels × 3 bits each = 21 bits) + // LoC field: bits [26:24] + // We replace only these fields from sysfs, preserving LoUIS [23:21], + // LoUU [29:27], ICB [32:30], and Ttype [46:33] from the original. 
+ const CTYPE_MASK: u64 = 0x001F_FFFF; // bits [20:0] + const LOC_MASK: u64 = 0x0700_0000; // bits [26:24] + const REPLACE_MASK: u64 = CTYPE_MASK | LOC_MASK; + (current & !REPLACE_MASK) | (sysfs & REPLACE_MASK) +} + #[cfg(test)] mod tests { use std::collections::HashMap; @@ -576,4 +675,101 @@ mod tests { assert_eq!(l1_caches.len(), 2); assert_eq!(l1_caches.len(), 2); } + + #[test] + fn test_build_clidr_from_caches() { + // L1 Separate (Data + Instruction) + L2 Unified + L3 Unified + let l1 = vec![ + CacheEntry { + level: 1, + type_: CacheType::Data, + ..CacheEntry::default() + }, + CacheEntry { + level: 1, + type_: CacheType::Instruction, + ..CacheEntry::default() + }, + ]; + let non_l1 = vec![ + CacheEntry { + level: 2, + type_: CacheType::Unified, + ..CacheEntry::default() + }, + CacheEntry { + level: 3, + type_: CacheType::Unified, + ..CacheEntry::default() + }, + ]; + let clidr = build_clidr_from_caches(&l1, &non_l1); + // ctype1=3 (Separate), ctype2=4 (Unified), ctype3=4 (Unified), LoC=3 + assert_eq!(clidr & 0x7, 3, "L1 should be Separate"); + assert_eq!((clidr >> 3) & 0x7, 4, "L2 should be Unified"); + assert_eq!((clidr >> 6) & 0x7, 4, "L3 should be Unified"); + assert_eq!((clidr >> 24) & 0x7, 3, "LoC should be 3"); + + // L1 Unified only (no higher levels) + let l1_unified = vec![CacheEntry { + level: 1, + type_: CacheType::Unified, + ..CacheEntry::default() + }]; + let clidr = build_clidr_from_caches(&l1_unified, &[]); + assert_eq!(clidr & 0x7, 4, "L1 should be Unified"); + assert_eq!((clidr >> 3) & 0x7, 0, "L2 should be NoCache"); + assert_eq!((clidr >> 24) & 0x7, 1, "LoC should be 1"); + + // No caches at all + let clidr = build_clidr_from_caches(&[], &[]); + assert_eq!(clidr, 0, "Empty caches should produce CLIDR=0"); + + // Mock store default: L1 Data + L1 Instruction + L2 Unified + let mut l1_mock: Vec = Vec::new(); + let mut non_l1_mock: Vec = Vec::new(); + read_cache_config(&mut l1_mock, &mut non_l1_mock).unwrap(); + let clidr = 
build_clidr_from_caches(&l1_mock, &non_l1_mock); + assert_eq!(clidr & 0x7, 3, "Mock L1 should be Separate"); + assert_eq!((clidr >> 3) & 0x7, 4, "Mock L2 should be Unified"); + assert_eq!((clidr >> 24) & 0x7, 2, "Mock LoC should be 2"); + } + + #[test] + fn test_merge_clidr() { + // CLIDR_EL1 layout: + // [20:0] Ctype1..Ctype7 (7 × 3 bits) + // [23:21] LoUIS + // [26:24] LoC + // [29:27] LoUU + // [32:30] ICB + // [46:33] Ttype1..Ttype7 + // + // merge_clidr replaces only Ctype [20:0] and LoC [26:24] from sysfs, + // preserving LoUIS, LoUU, ICB, and Ttype from current. + + // current: LoUU=2 [29:27], LoUIS=1 [23:21], ICB=1 [32:30] + // Ctype1=Unified(4) [2:0], LoC=1 [26:24] + let current: u64 = (1 << 30) // ICB=1 + | (2 << 27) // LoUU=2 + | (1 << 24) // LoC=1 + | (1 << 21) // LoUIS=1 + | 4; // Ctype1=Unified + // sysfs: Ctype1=Separate(3), Ctype2=Unified(4), Ctype3=Unified(4), LoC=3 + let sysfs: u64 = (3 << 24) | (4 << 6) | (4 << 3) | 3; + let merged = merge_clidr(current, sysfs); + + // Ctype and LoC should come from sysfs + assert_eq!(merged & 0x001F_FFFF, sysfs & 0x001F_FFFF, "Ctype mismatch"); + assert_eq!((merged >> 24) & 0x7, 3, "LoC should be 3 from sysfs"); + // LoUIS, LoUU, ICB should be preserved from current + assert_eq!((merged >> 21) & 0x7, 1, "LoUIS should be preserved"); + assert_eq!((merged >> 27) & 0x7, 2, "LoUU should be preserved"); + assert_eq!((merged >> 30) & 0x7, 1, "ICB should be preserved"); + + // When current == sysfs in the replaced region, merge is identity + let current = 0x0000_0000_0300_0123_u64; + let sysfs = 0x0000_0000_0300_0123_u64; + assert_eq!(merge_clidr(current, sysfs), current); + } } diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 4e82a7d3d56..e300499799c 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -23,11 +23,13 @@ use std::fs::File; use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; use 
vm_memory::{GuestMemoryError, GuestMemoryRegion}; +use zerocopy::IntoBytes; use crate::arch::{BootProtocol, EntryPoint, arch_memory_regions_with_gap}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; use crate::cpu_config::templates::CustomCpuTemplate; use crate::initrd::InitrdConfig; +use crate::logger::warn; use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{ @@ -51,6 +53,8 @@ pub enum ConfigurationError { VcpuConfig(#[from] CpuConfigurationError), /// Error configuring the vcpu: {0} VcpuConfigure(#[from] KvmVcpuError), + /// Failed to read host cache information: {0} + CacheInfo(#[from] cache_info::CacheInfoError), } /// Returns a Vec of the valid memory addresses for aarch64. @@ -118,6 +122,11 @@ pub fn configure_system_for_boot( &optional_capabilities, )?; } + + // Override CLIDR_EL1 ctype/LoC fields on each vCPU to match the host's + // real cache topology. See `override_clidr` for details. + override_clidr(vcpus)?; + let vcpu_mpidr = vcpus .iter_mut() .map(|cpu| cpu.kvm_vcpu.get_mpidr()) @@ -142,6 +151,70 @@ pub fn configure_system_for_boot( Ok(()) } +/// Override CLIDR_EL1 ctype/LoC fields on each vCPU to match the host's real +/// cache topology. +/// +/// Since host kernel 6.3 (commit 7af0c2534f4c), KVM fabricates CLIDR_EL1 +/// instead of passing through the host's real value. This can cause the guest +/// to see fewer cache levels than actually exist. Guest kernels >= 6.1.156 +/// backported `init_of_cache_level()` which counts cache leaves from the DT, +/// while `populate_cache_leaves()` uses CLIDR_EL1. If the DT (built from host +/// sysfs) describes different cache entries than CLIDR_EL1, the mismatch +/// causes cache sysfs entries to not be created. 
+/// +/// We read the current (possibly fabricated) CLIDR_EL1, replace only the ctype +/// and LoC fields with values derived from sysfs, and preserve all other fields +/// (LoUU, LoUIS, ICB, Ttype). This is safe on pre-6.3 kernels where CLIDR +/// already matches sysfs — the write is skipped as a no-op. +fn override_clidr(vcpus: &[Vcpu]) -> Result<(), ConfigurationError> { + let mut l1_caches = Vec::new(); + let mut non_l1_caches = Vec::new(); + cache_info::read_cache_config(&mut l1_caches, &mut non_l1_caches)?; + + // If sysfs reports no L1 caches, we cannot build a meaningful CLIDR. + // Writing an all-zero CLIDR would tell the guest there are no caches, + // which is worse than whatever KVM fabricated. Leave it alone. + if l1_caches.is_empty() { + warn!("No L1 caches found in sysfs, skipping CLIDR override"); + return Ok(()); + } + + let sysfs_clidr = cache_info::build_clidr_from_caches(&l1_caches, &non_l1_caches); + + let mut cur_clidr: u64 = 0; + // Reading/writing CLIDR_EL1 via KVM_SET_ONE_REG may not be supported on + // older kernels (pre-6.3). In that case KVM passes through the real host + // CLIDR and the override is unnecessary, so we warn and continue. + if let Err(e) = vcpus[0] + .kvm_vcpu + .fd + .get_one_reg(regs::CLIDR_EL1, cur_clidr.as_mut_bytes()) + { + warn!("Failed to read CLIDR_EL1, skipping override: {e}"); + return Ok(()); + } + + let new_clidr = cache_info::merge_clidr(cur_clidr, sysfs_clidr); + + if new_clidr != cur_clidr { + for vcpu in vcpus.iter() { + if let Err(e) = vcpu + .kvm_vcpu + .fd + .set_one_reg(regs::CLIDR_EL1, new_clidr.as_bytes()) + { + warn!( + "Failed to set CLIDR_EL1 to {:#x} on vCPU {}, skipping override: {e}", + new_clidr, vcpu.kvm_vcpu.index + ); + return Ok(()); + } + } + } + + Ok(()) +} + /// Returns the memory address where the kernel could be loaded. 
pub fn get_kernel_start() -> u64 { layout::SYSTEM_MEM_START + layout::SYSTEM_MEM_SIZE diff --git a/src/vmm/src/arch/aarch64/regs.rs b/src/vmm/src/arch/aarch64/regs.rs index 7a24337e5c0..abcde5914b8 100644 --- a/src/vmm/src/arch/aarch64/regs.rs +++ b/src/vmm/src/arch/aarch64/regs.rs @@ -90,6 +90,10 @@ arm64_sys_reg!(ID_AA64ISAR0_EL1, 3, 0, 0, 6, 0); arm64_sys_reg!(ID_AA64ISAR1_EL1, 3, 0, 0, 6, 1); arm64_sys_reg!(ID_AA64MMFR2_EL1, 3, 0, 0, 7, 2); +// Cache Level ID Register +// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/CLIDR-EL1--Cache-Level-ID-Register +arm64_sys_reg!(CLIDR_EL1, 3, 1, 0, 0, 1); + // Counter-timer Virtual Timer CompareValue register. // https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/CNTV-CVAL-EL0--Counter-timer-Virtual-Timer-CompareValue-register // https://elixir.bootlin.com/linux/v6.8/source/arch/arm64/include/asm/sysreg.h#L468 From 4acbb53adca130e50b139ce7a8ce5739943713a8 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 25 Mar 2026 17:44:43 +0000 Subject: [PATCH 36/53] fix(virtio-mem): interval intersection in slots_intersecting_range The previous implementation checked whether either slot endpoint fell inside the requested range. This missed the containment case where a slot fully contains the range (neither endpoint inside it), causing update_kvm_slots to silently skip KVM slot registration/removal for any block not aligned to a slot boundary. Replace the two addr_in_range endpoint checks with a proper half-open interval intersection test: slot_start < range_end && range_start < slot_end. Remove the now-unused addr_in_range helper and add a table-driven unit test covering boundary, interior, cross-slot, full-region, outside, and zero-length ranges. 
Signed-off-by: Nikita Kalyazin --- CHANGELOG.md | 5 +++ src/vmm/src/vstate/memory.rs | 77 ++++++++++++++++++++++++++++++------ 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3832542288d..ab031a95570 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,11 @@ and this project adheres to - [#5780](https://github.com/firecracker-microvm/firecracker/pull/5780): Fixed missing `/sys/devices/system/cpu/cpu*/cache/*` in aarch64 guests when running on host kernels >= 6.3 with guest kernels >= 6.1.156. +- [#5793](https://github.com/firecracker-microvm/firecracker/pull/5793): Fixed + virtio-mem plug/unplug skipping KVM slot updates for memory blocks not aligned + to a slot boundary. On plug, this could leave hotplugged memory inaccessible + to the guest. On unplug, the guest could retain access to memory that + Firecracker considered freed. ## [1.14.3] diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 9b62152c4b8..846ed9d2688 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -235,14 +235,6 @@ impl<'a> GuestMemorySlot<'a> { } } -fn addr_in_range(addr: GuestAddress, start: GuestAddress, len: usize) -> bool { - if let Some(end) = start.checked_add(len as u64) { - addr >= start && addr < end - } else { - false - } -} - impl GuestRegionMmapExt { /// Adds a DRAM region which only contains a single plugged slot pub(crate) fn dram_from_mmap_region(region: GuestRegionMmap, slot: u32) -> Self { @@ -345,11 +337,17 @@ impl GuestRegionMmapExt { len: usize, ) -> impl Iterator> { self.slots().map(|(slot, _)| slot).filter(move |slot| { - if let Some(slot_end) = slot.guest_addr.checked_add(slot.slice.len() as u64) { - addr_in_range(slot.guest_addr, from, len) || addr_in_range(slot_end, from, len) - } else { - false - } + // Two intervals [a, b) and [c, d) intersect iff a < d && c < b. 
+ // This correctly handles the containment case where the slot fully + // contains the range (or vice versa). + let slot_start = slot.guest_addr; + let Some(slot_end) = slot_start.checked_add(slot.slice.len() as u64) else { + return false; + }; + let Some(range_end) = from.checked_add(len as u64) else { + return false; + }; + slot_start < range_end && from < slot_end }) } @@ -1461,4 +1459,57 @@ mod tests { GuestMemoryError::IOError(_) ); } + + /// Verifies that `slots_intersecting_range` returns the correct slots for + /// ranges at slot boundaries, interior to a slot, and spanning two slots. + #[test] + fn test_slots_intersecting_range() { + let page_size = get_page_size().unwrap(); + let slot_size = 4 * page_size; + let region_size = 2 * slot_size; + let base = GuestAddress(0); + let slot1_base = base.unchecked_add(slot_size as u64); + + let mmap_region = anonymous( + std::iter::once((base, region_size)), + false, + HugePageConfig::None, + ) + .unwrap() + .into_iter() + .next() + .unwrap(); + + let region = GuestRegionMmapExt::hotpluggable_from_mmap_region(mmap_region, 0, slot_size); + assert_eq!(region.slot_cnt(), 2); + + // (range_offset_in_pages, range_len_in_pages, expected_slot_addrs) + let cases: &[(usize, usize, &[GuestAddress])] = &[ + // At slot 0 boundary + (0, 1, &[base]), + // Interior to slot 0 + (1, 1, &[base]), + // Interior to slot 1 + (5, 1, &[slot1_base]), + // Spanning slot 0 and slot 1 + (3, 2, &[base, slot1_base]), + // Entire region + (0, 8, &[base, slot1_base]), + // Outside the region + (8, 1, &[]), + // Zero-length range + (0, 0, &[]), + ]; + + for &(offset_pages, len_pages, expected) in cases { + let from = base.unchecked_add((offset_pages * page_size) as u64); + let len = len_pages * page_size; + let found: Vec<_> = region.slots_intersecting_range(from, len).collect(); + let addrs: Vec<_> = found.iter().map(|s| s.guest_addr).collect(); + assert_eq!( + addrs, expected, + "offset={offset_pages} pages, len={len_pages} pages" + ); + } + } 
} From fd18613c5993b63b24ea1eb5c1c9336d41885192 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 25 Mar 2026 18:34:49 +0000 Subject: [PATCH 37/53] fix(balloon): bound stats descriptor length process_stats_queue() used the guest-provided descriptor len field as the loop bound without validation. A misbehaving guest could set this to u32::MAX, causing excessive iterations that temporarily monopolise the VMM event loop. Add a MAX_STATS_DESC_LEN check before entering the loop. The limit uses a generous upper bound (256 tags) rather than the current spec count, so future kernel additions won't silently break stats collection. Oversized descriptors are logged and held without updating stats, preserving the stats request/response protocol. Signed-off-by: Nikita Kalyazin --- CHANGELOG.md | 4 + src/vmm/src/devices/virtio/balloon/device.rs | 92 ++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab031a95570..5eb8ea516a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,10 @@ and this project adheres to to a slot boundary. On plug, this could leave hotplugged memory inaccessible to the guest. On unplug, the guest could retain access to memory that Firecracker considered freed. +- [#5794](https://github.com/firecracker-microvm/firecracker/pull/5794): Bound + balloon statistics descriptor length to prevent a guest-controlled oversized + descriptor from temporarily stalling the VMM event loop. Only affects microVMs + with `stats_polling_interval_s > 0`. 
## [1.14.3] diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 4d83075fa0f..33538a9c6a3 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -42,6 +42,16 @@ use crate::{impl_device_type, mem_size_mib}; const SIZE_OF_U32: usize = std::mem::size_of::(); const SIZE_OF_STAT: usize = std::mem::size_of::(); +/// Upper bound on the number of stats tags a guest may report. +/// The VirtIO spec currently defines 16, but newer kernel versions can +/// add more (e.g. Linux 6.12 added several, see 74c025c5d7e4). We use a +/// generous limit that still bounds computation without breaking on future +/// kernels. +const MAX_STATS_TAGS: u32 = 256; +/// Maximum valid stats descriptor length in bytes. +/// Descriptors exceeding this are rejected to prevent unbounded iteration. +#[allow(clippy::cast_possible_truncation)] +const MAX_STATS_DESC_LEN: u32 = MAX_STATS_TAGS * std::mem::size_of::() as u32; fn mib_to_pages(amount_mib: u32) -> Result { amount_mib @@ -492,6 +502,21 @@ impl Balloon { error!("balloon: driver is not compliant, more than one stats buffer received"); self.queues[STATS_INDEX].add_used(prev_stats_desc, 0)?; } + + // Reject oversized descriptors to prevent a guest from causing + // excessive iteration on the VMM event loop. + // We still hold onto the descriptor (via stats_desc_index below) + // so that the stats request/response protocol is preserved and + // trigger_stats_update can return it to the guest later. + if head.len > MAX_STATS_DESC_LEN { + warn!( + "balloon: stats descriptor too large: {} > {}, skipping", + head.len, MAX_STATS_DESC_LEN + ); + self.stats_desc_index = Some(head.index); + continue; + } + for index in (0..head.len).step_by(SIZE_OF_STAT) { // Read the address at position `index`. 
The only case // in which this fails is if there is overflow, @@ -1952,4 +1977,71 @@ pub(crate) mod tests { assert_eq!(balloon.num_pages(), 0x1122_3344); assert_eq!(balloon.actual_pages(), 0x1234_5678); } + + /// Test that process_stats_queue holds oversized descriptors without + /// updating stats, and updates stats for valid-length ones. + #[test] + fn test_stats_queue_oversized_descriptor_rejected() { + struct TestCase { + desc_len: u32, + stats_updated: bool, + } + + let cases = [ + TestCase { + desc_len: MAX_STATS_DESC_LEN + 1, + stats_updated: false, + }, + TestCase { + desc_len: MAX_STATS_DESC_LEN, + stats_updated: true, + }, + ]; + + let stat_addr: u64 = 0x1000; + + for tc in &cases { + let mut balloon = Balloon::new(0, true, 1, false, false).unwrap(); + let mem = default_mem(); + let statsq = VirtQueue::new(GuestAddress(0), &mem, 16); + balloon.set_queue(INFLATE_INDEX, statsq.create_queue()); + balloon.set_queue(DEFLATE_INDEX, statsq.create_queue()); + balloon.set_queue(STATS_INDEX, statsq.create_queue()); + balloon.activate(mem.clone(), default_interrupt()).unwrap(); + + // Fill the descriptor region with a recognisable stat value. + let n_stats = tc.desc_len as usize / SIZE_OF_STAT; + for i in 0..n_stats { + mem.write_obj::( + BalloonStat { + tag: VIRTIO_BALLOON_S_MEMFREE, + val: 0xBEEF, + }, + GuestAddress(stat_addr + (i * SIZE_OF_STAT) as u64), + ) + .unwrap(); + } + + set_request(&statsq, 0, stat_addr, tc.desc_len, VIRTQ_DESC_F_NEXT); + balloon.queue_events()[STATS_INDEX].write(1).unwrap(); + balloon.process_stats_queue_event().unwrap(); + + // The descriptor should always be held (stats protocol preserved) + // regardless of whether the stats were updated. + assert!( + balloon.stats_desc_index.is_some(), + "desc_len={}: descriptor should be held", + tc.desc_len, + ); + + // Verify stats were only updated for valid descriptors. 
+ assert_eq!( + balloon.latest_stats.free_memory.is_some(), + tc.stats_updated, + "desc_len={}: expected stats_updated={}", + tc.desc_len, + tc.stats_updated, + ); + } + } } From 4e860cbee3dd31bd583edb585e4157c0ef533ec0 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Thu, 26 Mar 2026 12:21:56 +0000 Subject: [PATCH 38/53] fix(balloon): make duplicate stats buffer visible to guest When a non-compliant driver submits more than one stats buffer, process_stats_queue returns the previous descriptor via add_used but never calls advance_used_ring_idx or signal_used_queue. The write to the used ring is therefore invisible to the guest, which can never reclaim the buffer. Add the missing advance_used_ring_idx and signal_used_queue calls so the guest actually sees the returned descriptor. Signed-off-by: Nikita Kalyazin --- src/vmm/src/devices/virtio/balloon/device.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 33538a9c6a3..a5095b44f67 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -501,6 +501,8 @@ impl Balloon { // the protocol, but return it if we find one. error!("balloon: driver is not compliant, more than one stats buffer received"); self.queues[STATS_INDEX].add_used(prev_stats_desc, 0)?; + self.queues[STATS_INDEX].advance_used_ring_idx(); + self.signal_used_queue(STATS_INDEX)?; } // Reject oversized descriptors to prevent a guest from causing From cc4bef8b2faa6ed2b5a554a2a157bf5f72cc4551 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 2 Apr 2026 11:09:06 +0100 Subject: [PATCH 39/53] fix(kvm-clock): do not jump monotonic clock on restore Firecracker has never advanced the clock on restore for any of the supported clocksources. Since Linux 5.16, the KVM_CLOCK_REALTIME has been passed to kvm-clock, causing the monotonic time in the guest to jump when using kvm-clock as clock source. 
Despite being unexpected and not what Firecracker should do, we recognize this may be a valid usecase so this patch adds a way to configure it, keeping the default to the expected documented behaviour. This patch adds a new API flag to LoadSnapshot, clock_realtime, that advances the clock on restore when set (default is False). Rather than the clock flags being decided at snapshot time, the restore path ignores those flags and decides what to do depending on the clock_realtime flag. This is because the other available flag (KVM_CLOCK_TSC_STABLE) cannot even be passed to `set_clock`, meaning the only valid flag is KVM_CLOCK_REALTIME. The name of the flag was kept generic as we may add this behaviour for the other clock sources in the future, if the need arises. Signed-off-by: Riccardo Mancini --- CHANGELOG.md | 6 ++ docs/snapshotting/snapshot-support.md | 5 + .../src/api_server/request/snapshot.rs | 6 ++ src/firecracker/swagger/firecracker.yaml | 8 ++ src/vmm/src/arch/x86_64/vm.rs | 84 ++++++++++++--- src/vmm/src/builder.rs | 8 +- src/vmm/src/persist.rs | 1 + src/vmm/src/rpc_interface.rs | 1 + src/vmm/src/vmm_config/snapshot.rs | 7 ++ src/vmm/src/vstate/vm.rs | 2 +- src/vmm/tests/integration_tests.rs | 2 + tests/framework/microvm.py | 13 ++- .../functional/test_snapshot_basic.py | 102 ++++++++++++++++++ 13 files changed, 226 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5eb8ea516a1..c513dd1f60a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,12 @@ and this project adheres to balloon statistics descriptor length to prevent a guest-controlled oversized descriptor from temporarily stalling the VMM event loop. Only affects microVMs with `stats_polling_interval_s > 0`. 
+- [#5809](https://github.com/firecracker-microvm/firecracker/pull/5809): Fixed a + bug on host Linux >= 5.16 for x86_64 guests using the `kvm-clock` clock source + causing the monotonic clock to jump on restore by the wall-clock time elapsed + since the snapshot was taken. Users using `kvm-clock` that want to explicitly + advance the clock with `KVM_CLOCK_REALTIME` can opt back in using the new + `clock_realtime` flag in `LoadSnapshot` API. ## [1.14.3] diff --git a/docs/snapshotting/snapshot-support.md b/docs/snapshotting/snapshot-support.md index 6e1ac4d4c35..b1b485de5f8 100644 --- a/docs/snapshotting/snapshot-support.md +++ b/docs/snapshotting/snapshot-support.md @@ -492,6 +492,11 @@ resumed with the guest OS wall-clock continuing from the moment of the snapshot creation. For this reason, the wall-clock should be updated to the current time, on the guest-side. More details on how you could do this can be found at a [related FAQ](../../FAQ.md#my-guest-wall-clock-is-drifting-how-can-i-fix-it). +When using `kvm-clock` as clock source on `x86_64`, it's possible to optionally +set the `clock_realtime: true` in the `LoadSnapshot` request to advance the +clock on the guest at restore time (host Linux >= 5.16 is required to support +this feature). Note that this may cause issues within the guest as the clock +will appear to suddenly jump. ## Provisioning host disk space for snapshots diff --git a/src/firecracker/src/api_server/request/snapshot.rs b/src/firecracker/src/api_server/request/snapshot.rs index 8284aa66287..1e540dc2af0 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -110,6 +110,7 @@ fn parse_put_snapshot_load(body: &Body) -> Result { || snapshot_config.track_dirty_pages, resume_vm: snapshot_config.resume_vm, network_overrides: snapshot_config.network_overrides, + clock_realtime: snapshot_config.clock_realtime, }; // Construct the `ParsedRequest` object. 
@@ -187,6 +188,7 @@ mod tests { track_dirty_pages: false, resume_vm: false, network_overrides: vec![], + clock_realtime: false, }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -217,6 +219,7 @@ mod tests { track_dirty_pages: true, resume_vm: false, network_overrides: vec![], + clock_realtime: false, }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -247,6 +250,7 @@ mod tests { track_dirty_pages: false, resume_vm: true, network_overrides: vec![], + clock_realtime: false, }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -286,6 +290,7 @@ mod tests { iface_id: String::from("eth0"), host_dev_name: String::from("vmtap2"), }], + clock_realtime: false, }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -313,6 +318,7 @@ mod tests { track_dirty_pages: false, resume_vm: true, network_overrides: vec![], + clock_realtime: false, }; let parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert_eq!( diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index c68f7fe04e0..e06e9297a29 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1631,6 +1631,14 @@ definitions: description: Network host device names to override items: $ref: "#/definitions/NetworkOverride" + clock_realtime: + type: boolean + description: + "[x86_64 only] When set to true, passes KVM_CLOCK_REALTIME to + KVM_SET_CLOCK on restore, advancing kvmclock by the wall-clock time + elapsed since the snapshot was taken. When false (default), kvmclock resumes + from where it was at snapshot time. This option may be extended to other clock + sources and CPU architectures in the future." 
TokenBucket: diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index 37d97d8c212..1bcd092b725 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -5,7 +5,7 @@ use std::fmt; use std::sync::{Arc, Mutex}; use kvm_bindings::{ - KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, + KVM_CLOCK_REALTIME, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_PIT_SPEAKER_DUMMY, MsrList, kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, }; use kvm_ioctls::Cap; @@ -30,6 +30,8 @@ pub enum ArchVmError { SetPit2(kvm_ioctls::Error), /// Set clock error: {0} SetClock(kvm_ioctls::Error), + /// clock_realtime requested but not present in the snapshot state + ClockRealtimeNotInState, /// Set IrqChipPicMaster error: {0} SetIrqChipPicMaster(kvm_ioctls::Error), /// Set IrqChipPicSlave error: {0} @@ -127,13 +129,25 @@ impl ArchVm { /// - [`kvm_ioctls::VmFd::set_irqchip`] errors. /// - [`kvm_ioctls::VmFd::set_irqchip`] errors. /// - [`kvm_ioctls::VmFd::set_irqchip`] errors. 
- pub fn restore_state(&mut self, state: &VmState) -> Result<(), ArchVmError> { + pub fn restore_state( + &mut self, + state: &VmState, + clock_realtime: bool, + ) -> Result<(), ArchVmError> { self.fd() .set_pit2(&state.pitstate) .map_err(ArchVmError::SetPit2)?; - self.fd() - .set_clock(&state.clock) - .map_err(ArchVmError::SetClock)?; + let mut clock = state.clock; + clock.flags = if clock_realtime { + // clock_realtime needs to be present in the snapshot + if clock.flags & KVM_CLOCK_REALTIME == 0 { + return Err(ArchVmError::ClockRealtimeNotInState); + } + KVM_CLOCK_REALTIME + } else { + 0 + }; + self.fd().set_clock(&clock).map_err(ArchVmError::SetClock)?; self.fd() .set_irqchip(&state.pic_master) .map_err(ArchVmError::SetIrqChipPicMaster)?; @@ -167,9 +181,7 @@ impl ArchVm { pub fn save_state(&self) -> Result { let pitstate = self.fd().get_pit2().map_err(ArchVmError::VmGetPit2)?; - let mut clock = self.fd().get_clock().map_err(ArchVmError::VmGetClock)?; - // This bit is not accepted in SET_CLOCK, clear it. 
- clock.flags &= !KVM_CLOCK_TSC_STABLE; + let clock = self.fd().get_clock().map_err(ArchVmError::VmGetClock)?; let mut pic_master = kvm_irqchip { chip_id: KVM_IRQCHIP_PIC_MASTER, @@ -247,11 +259,15 @@ impl fmt::Debug for VmState { #[cfg(test)] mod tests { + use std::time::SystemTime; + use kvm_bindings::{ - KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, + KVM_CLOCK_REALTIME, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_PIT_SPEAKER_DUMMY, }; + use kvm_ioctls::Cap; + use crate::arch::ArchVmError; use crate::snapshot::Snapshot; use crate::vstate::vm::VmState; use crate::vstate::vm::tests::{setup_vm, setup_vm_with_memory}; @@ -271,7 +287,6 @@ mod tests { vm_state.pitstate.flags | KVM_PIT_SPEAKER_DUMMY, KVM_PIT_SPEAKER_DUMMY ); - assert_eq!(vm_state.clock.flags & KVM_CLOCK_TSC_STABLE, 0); assert_eq!(vm_state.pic_master.chip_id, KVM_IRQCHIP_PIC_MASTER); assert_eq!(vm_state.pic_slave.chip_id, KVM_IRQCHIP_PIC_SLAVE); assert_eq!(vm_state.ioapic.chip_id, KVM_IRQCHIP_IOAPIC); @@ -279,7 +294,46 @@ mod tests { let (_, mut vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); - vm.restore_state(&vm_state).unwrap(); + vm.restore_state(&vm_state, false).unwrap(); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_vm_save_restore_state_kvm_clock_realtime() { + let (kvm, vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + let clock_realtime_supported = + kvm.fd.check_extension_int(Cap::AdjustClock).cast_unsigned() & KVM_CLOCK_REALTIME != 0; + + // mock a state without realtime information + let mut vm_state = vm.save_state().unwrap(); + vm_state.clock.flags &= !KVM_CLOCK_REALTIME; + + let (_, mut vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + let res = vm.restore_state(&vm_state, true); + assert!(res == Err(ArchVmError::ClockRealtimeNotInState)); + + // mock a state with realtime information + vm_state.clock.flags |= KVM_CLOCK_REALTIME; + 
vm_state.clock.realtime = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_nanos() + .try_into() + .unwrap(); + + let (_, mut vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + let res = vm.restore_state(&vm_state, true); + if clock_realtime_supported { + res.unwrap() + } else { + assert!(matches!(res, Err(ArchVmError::SetClock(err)) if err.errno() == libc::EINVAL)) + } } #[cfg(target_arch = "x86_64")] @@ -297,18 +351,18 @@ mod tests { // Try to restore an invalid PIC Master chip ID let orig_master_chip_id = vm_state.pic_master.chip_id; vm_state.pic_master.chip_id = KVM_NR_IRQCHIPS; - vm.restore_state(&vm_state).unwrap_err(); + vm.restore_state(&vm_state, false).unwrap_err(); vm_state.pic_master.chip_id = orig_master_chip_id; // Try to restore an invalid PIC Slave chip ID let orig_slave_chip_id = vm_state.pic_slave.chip_id; vm_state.pic_slave.chip_id = KVM_NR_IRQCHIPS; - vm.restore_state(&vm_state).unwrap_err(); + vm.restore_state(&vm_state, false).unwrap_err(); vm_state.pic_slave.chip_id = orig_slave_chip_id; // Try to restore an invalid IOPIC chip ID vm_state.ioapic.chip_id = KVM_NR_IRQCHIPS; - vm.restore_state(&vm_state).unwrap_err(); + vm.restore_state(&vm_state, false).unwrap_err(); } #[cfg(target_arch = "x86_64")] @@ -326,6 +380,6 @@ mod tests { .unwrap() .data; - vm.restore_state(&restored_state).unwrap(); + vm.restore_state(&restored_state, false).unwrap(); } } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 332b1ac3cc3..689122a3e07 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -423,6 +423,8 @@ pub enum BuildMicrovmFromSnapshotError { SeccompFiltersInternal(#[from] crate::seccomp::InstallationError), /// Failed to restore devices: {0} RestoreDevices(#[from] DevicePersistError), + /// clock_realtime is not supported on aarch64. + UnsupportedClockRealtime, } /// Builds and starts a microVM based on the provided MicrovmState. 
@@ -438,6 +440,7 @@ pub fn build_microvm_from_snapshot( uffd: Option, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, + clock_realtime: bool, ) -> Result>, BuildMicrovmFromSnapshotError> { // Build Vmm. debug!("event_start: build microvm from snapshot"); @@ -479,6 +482,9 @@ pub fn build_microvm_from_snapshot( #[cfg(target_arch = "aarch64")] { + if clock_realtime { + return Err(BuildMicrovmFromSnapshotError::UnsupportedClockRealtime); + } let mpidrs = construct_kvm_mpidrs(µvm_state.vcpu_states); // Restore kvm vm state. vm.restore_state(&mpidrs, µvm_state.vm_state)?; @@ -486,7 +492,7 @@ pub fn build_microvm_from_snapshot( // Restore kvm vm state. #[cfg(target_arch = "x86_64")] - vm.restore_state(µvm_state.vm_state)?; + vm.restore_state(µvm_state.vm_state, clock_realtime)?; // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index ba2608070c6..ad954c041d1 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -427,6 +427,7 @@ pub fn restore_from_snapshot( uffd, seccomp_filters, vm_resources, + params.clock_realtime, ) .map_err(RestoreFromSnapshotError::Build) } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index fdd0862a9d4..f186447d3dd 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -1369,6 +1369,7 @@ mod tests { track_dirty_pages: false, resume_vm: false, network_overrides: vec![], + clock_realtime: false, }, ))); check_unsupported(runtime_request(VmmAction::SetEntropyDevice( diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 13a87ba30c4..b429c9a241a 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -72,6 +72,10 @@ pub struct LoadSnapshotParams { pub resume_vm: bool, /// The network devices to override on load. 
pub network_overrides: Vec, + /// [x86_64 only] When set to true, passes `KVM_CLOCK_REALTIME` to `KVM_SET_CLOCK` on restore, + /// advancing kvmclock by the wall-clock time elapsed since the snapshot was taken. When false + /// (default), kvmclock resumes from where it was at snapshot time. + pub clock_realtime: bool, } /// Stores the configuration for loading a snapshot that is provided by the user. @@ -101,6 +105,9 @@ pub struct LoadSnapshotConfig { /// The network devices to override on load. #[serde(default)] pub network_overrides: Vec, + /// [x86_64 only] When set to true, passes `KVM_CLOCK_REALTIME` to `KVM_SET_CLOCK` on restore. + #[serde(default)] + pub clock_realtime: bool, } /// Stores the configuration used for managing snapshot memory. diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 83e899eff1d..37e9039b8e8 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -885,7 +885,7 @@ pub(crate) mod tests { let restored_state: VmState = Snapshot::load_without_crc_check(snapshot_data.as_slice()) .unwrap() .data; - vm.restore_state(&restored_state).unwrap(); + vm.restore_state(&restored_state, false).unwrap(); let mut resource_allocator = vm.resource_allocator(); let gsi_new = resource_allocator.allocate_gsi_msi(1).unwrap()[0]; diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 6a5e6a08a14..3a546643715 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -302,6 +302,7 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { track_dirty_pages: false, resume_vm: true, network_overrides: vec![], + clock_realtime: false, })) .unwrap(); @@ -386,6 +387,7 @@ fn verify_load_snap_disallowed_after_boot_resources(res: VmmAction, res_name: &s track_dirty_pages: false, resume_vm: false, network_overrides: vec![], + clock_realtime: false, }); let err = preboot_api_controller.handle_preboot_request(req); assert!( diff --git 
a/tests/framework/microvm.py b/tests/framework/microvm.py index c8babc9f54b..7ba32305187 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -1076,6 +1076,7 @@ def restore_from_snapshot( snapshot: Snapshot, resume: bool = False, rename_interfaces: dict = None, + clock_realtime: bool = False, *, uffd_handler_name: str = None, ): @@ -1132,6 +1133,9 @@ def restore_from_snapshot( # can be inline in the snapshot_load command below optional_kwargs["network_overrides"] = iface_overrides + if clock_realtime: + optional_kwargs["clock_realtime"] = clock_realtime + self.api.snapshot_load.put( mem_backend=mem_backend, snapshot_path=str(jailed_vmstate), @@ -1286,12 +1290,17 @@ def build(self, kernel=None, rootfs=None, **kwargs): vm.ssh_key = ssh_key return vm - def build_from_snapshot(self, snapshot: Snapshot, uffd_handler_name=None): + def build_from_snapshot( + self, snapshot: Snapshot, uffd_handler_name=None, clock_realtime=False + ): """Build a microvm from a snapshot""" vm = self.build() vm.spawn() vm.restore_from_snapshot( - snapshot, resume=True, uffd_handler_name=uffd_handler_name + snapshot, + resume=True, + uffd_handler_name=uffd_handler_name, + clock_realtime=clock_realtime, ) return vm diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 3695941fc5d..dc3a1f761dd 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -584,3 +584,105 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory): rename_interfaces={iface_override.dev_name: iface_override.tap_name}, resume=True, ) + + +SLEEP_SECONDS = 30 + +CLOCK_SOURCES = {"x86_64": ["tsc", "kvm-clock"], "aarch64": ["arch_sys_counter"]}[ + global_props.cpu_architecture +] + + +def read_guest_monotonic(vm): + """Read CLOCK_MONOTONIC inside the guest""" + _, stdout, _ = vm.ssh.check_output( + "python3 -c 'import time; 
print(time.monotonic())'" + ) + return float(stdout.strip()) + + +def read_guest_clocksource(vm): + """Read the active clocksource inside the guest""" + _, stdout, _ = vm.ssh.check_output( + "cat /sys/devices/system/clocksource/clocksource0/current_clocksource" + ) + return stdout.strip() + + +@pytest.mark.parametrize("clocksource", CLOCK_SOURCES) +@pytest.mark.parametrize("clock_realtime", [False, True]) +def test_clocksource_snapshot_restore( + uvm_plain_any, microvm_factory, clocksource, clock_realtime +): + """Measure CLOCK_MONOTONIC before snapshot and after restore to determine + whether the clocksource jumps forward or resumes from where it left off.""" + + if clock_realtime and clocksource != "kvm-clock": + pytest.skip(f"Clocksource {clocksource} doesn't support clock_realtime flag") + if clock_realtime and global_props.host_linux_version_tpl < (5, 16): + pytest.skip("clock_realtime is not supported on Linux < 5.16") + + boot_args = ( + "reboot=k panic=1 nomodule swiotlb=noforce console=ttyS0" + f" clocksource={clocksource}" + ) + + vm = uvm_plain_any + vm.spawn() + vm.basic_config(vcpu_count=2, mem_size_mib=256, boot_args=boot_args) + vm.add_net_iface() + vm.start() + + # Confirm the clocksource took effect + active = read_guest_clocksource(vm) + _, avail_out, _ = vm.ssh.check_output( + "cat /sys/devices/system/clocksource/clocksource0/available_clocksource" + ) + print("Available clocksources: %s", avail_out.strip()) + if active != clocksource: + pytest.skip(f"Clocksource {clocksource} not available") + + guest_before = read_guest_monotonic(vm) + host_before = time.monotonic() + + snapshot = vm.snapshot_full() + vm.kill() + + print("Sleeping %ds between snapshot and restore...", SLEEP_SECONDS) + time.sleep(SLEEP_SECONDS) + + restored_vm = microvm_factory.build_from_snapshot( + snapshot, clock_realtime=clock_realtime + ) + + guest_after = read_guest_monotonic(restored_vm) + host_after = time.monotonic() + + # Confirm clocksource survived the restore + 
active_after = read_guest_clocksource(restored_vm) + assert ( + active_after == clocksource + ), f"Clocksource changed after restore: {clocksource} -> {active_after}" + + guest_delta = guest_after - guest_before + host_delta = host_after - host_before + + # If guest_delta is close to host_delta, the clock jumped forward + # (suspend/resume behavior). If it's near 0, it resumed from where + # it left off. + jumped = abs(guest_delta - host_delta) < 5.0 + + jumped_str = "JUMPED" if jumped else "RESUMED" + + print( + f"Host kernel: {global_props.host_linux_version}\n" + f"Clocksource: {clocksource}\n" + f"Guest MONOTONIC before: {guest_before:.3f} s\n" + f"Guest MONOTONIC after: {guest_after:.3f} s\n" + f"Guest delta: {guest_delta:.3f} s\n" + f"Host delta: {host_delta:.3f} s\n" + f"Behavior: {jumped_str}\n" + ) + assert ( + jumped == clock_realtime + ), f"Clock {jumped_str} but clock_realtime was {"not" if clock_realtime else ""} set." From ce5ef6c45b3a7616a0e9b6a868a29559c735dc06 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 2 Apr 2026 11:47:30 +0100 Subject: [PATCH 40/53] doc: fix line in design about only kvm-clock being available We actually support all of them. Signed-off-by: Riccardo Mancini --- docs/design.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/design.md b/docs/design.md index b35b845b8b3..9a7e409a34d 100644 --- a/docs/design.md +++ b/docs/design.md @@ -118,7 +118,11 @@ and/or creating their own custom CPU templates. #### Clocksources available to guests -Firecracker only exposes kvm-clock to customers. +Firecracker exposes the following clock sources to guests: + +- x86_64: kvm-clock and tsc. Linux guests >=5.10 will pick tsc by default if + stable. 
+- aarch64: arch_sys_counter ### I/O: Storage, Networking and Rate Limiting From 97e119947a728c2712fa6c0fe22fe5062f7447f4 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 2 Apr 2026 18:52:16 +0100 Subject: [PATCH 41/53] chore: release v1.14.4 Update version number / CHANGELOG / CREDITS Signed-off-by: Riccardo Mancini --- Cargo.lock | 12 ++++++------ src/cpu-template-helper/Cargo.toml | 2 +- src/firecracker/Cargo.toml | 2 +- src/firecracker/swagger/firecracker.yaml | 2 +- src/jailer/Cargo.toml | 2 +- src/rebase-snap/Cargo.toml | 2 +- src/seccompiler/Cargo.toml | 2 +- src/snapshot-editor/Cargo.toml | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1c9b4fef3c4..009f6ce23da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -355,7 +355,7 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "cpu-template-helper" -version = "1.14.3" +version = "1.14.4" dependencies = [ "clap", "displaydoc", @@ -518,7 +518,7 @@ checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" [[package]] name = "firecracker" -version = "1.14.3" +version = "1.14.4" dependencies = [ "cargo_toml", "displaydoc", @@ -667,7 +667,7 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jailer" -version = "1.14.3" +version = "1.14.4" dependencies = [ "libc", "log-instrument", @@ -1032,7 +1032,7 @@ dependencies = [ [[package]] name = "rebase-snap" -version = "1.14.3" +version = "1.14.4" dependencies = [ "displaydoc", "libc", @@ -1126,7 +1126,7 @@ dependencies = [ [[package]] name = "seccompiler" -version = "1.14.3" +version = "1.14.4" dependencies = [ "bincode", "clap", @@ -1223,7 +1223,7 @@ checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "snapshot-editor" -version = "1.14.3" +version = "1.14.4" dependencies = [ "clap", "clap-num", diff --git a/src/cpu-template-helper/Cargo.toml 
b/src/cpu-template-helper/Cargo.toml index 910b38a8d5b..6d674bb6c1d 100644 --- a/src/cpu-template-helper/Cargo.toml +++ b/src/cpu-template-helper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cpu-template-helper" -version = "1.14.3" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 8b6a5e1087a..68f22554e77 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "firecracker" -version = "1.14.3" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" build = "build.rs" diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index e06e9297a29..828bb086198 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -5,7 +5,7 @@ info: The API is accessible through HTTP calls on specific URLs carrying JSON modeled data. The transport medium is a Unix Domain Socket. - version: 1.14.3 + version: 1.14.4 termsOfService: "" contact: email: "firecracker-maintainers@amazon.com" diff --git a/src/jailer/Cargo.toml b/src/jailer/Cargo.toml index 46195fb922a..03a3aeceb10 100644 --- a/src/jailer/Cargo.toml +++ b/src/jailer/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "jailer" -version = "1.14.3" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" description = "Process for starting Firecracker in production scenarios; applies a cgroup/namespace isolation barrier and then drops privileges." 
diff --git a/src/rebase-snap/Cargo.toml b/src/rebase-snap/Cargo.toml index 69457726b8f..eb96d56ec16 100644 --- a/src/rebase-snap/Cargo.toml +++ b/src/rebase-snap/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rebase-snap" -version = "1.14.3" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" diff --git a/src/seccompiler/Cargo.toml b/src/seccompiler/Cargo.toml index 657c5e6bfab..d83f7b73d2f 100644 --- a/src/seccompiler/Cargo.toml +++ b/src/seccompiler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "seccompiler" -version = "1.14.3" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" description = "Program that compiles multi-threaded seccomp-bpf filters expressed as JSON into raw BPF programs, serializing them and outputting them to a file." diff --git a/src/snapshot-editor/Cargo.toml b/src/snapshot-editor/Cargo.toml index d358db4e39e..eae727c2030 100644 --- a/src/snapshot-editor/Cargo.toml +++ b/src/snapshot-editor/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snapshot-editor" -version = "1.14.3" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" From 567170a4d0bf14c7910d7d58fd16abd228dfdcae Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Wed, 8 Apr 2026 07:39:18 +0000 Subject: [PATCH 42/53] chore: Fill placeholder for PR number of virtio PCI transport fix The change was made privately, so the PR number wasn't available. Signed-off-by: Takahiro Itazuri --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c513dd1f60a..b6a5ec5d61a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,14 +15,14 @@ and this project adheres to a descriptor chain that caused Firecracker to allocate more host memory than the guest actually provided, potentially leading to excessive host memory consumption. 
-- [#XXXX](https://github.com/firecracker-microvm/firecracker/pull/XXXX): Enforce +- [#5818](https://github.com/firecracker-microvm/firecracker/pull/5818): Enforce the virtio device initialization sequence in the PCI transport, matching the existing MMIO transport behavior. The PCI transport now validates device status transitions, rejects queue configuration writes outside the FEATURES_OK to DRIVER_OK window, rejects feature negotiation outside the DRIVER state, blocks re-initialization after a failed reset, and sets DEVICE_NEEDS_RESET when device activation fails. -- [#XXXX](https://github.com/firecracker-microvm/firecracker/pull/XXXX): Reject +- [#5818](https://github.com/firecracker-microvm/firecracker/pull/5818): Reject device status writes that clear previously set bits in the MMIO transport, except for reset. - [#5780](https://github.com/firecracker-microvm/firecracker/pull/5780): Fixed From cd35b5ab0273a0e90f812dfcc8f5d1cc4945cfd4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Feb 2026 10:56:11 -0800 Subject: [PATCH 43/53] swagger: add APIs for getting guest memory info Add a few APIs to get information about guest memory: * An endpoint for guest memory mappings (guest physical to host virtual). * An endpoint for resident and empty pages. * An endpoint for dirty pages. Signed-off-by: Babis Chalios --- src/firecracker/swagger/firecracker.yaml | 110 +++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 828bb086198..e0a7f410eaf 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -786,6 +786,50 @@ paths: schema: $ref: "#/definitions/Error" + /memory/mappings: + get: + summary: Gets the memory mappings with skippable pages bitmap. 
+ operationId: getMemoryMappings + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryMappingsResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory: + get: + summary: Gets the memory info (resident and empty pages). + description: Returns an object with resident and empty bitmaps. The resident bitmap marks all pages that are resident. The empty bitmap marks zero pages (subset of resident pages). This is checked at the pageSize of each region. All regions must have the same page size. + operationId: getMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory/dirty: + get: + summary: Gets the dirty guest memory + description: This returns the resident memory that has been written since last snapshot. + operationId: getDirtyMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/DirtyMemory" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + /version: get: summary: Gets the Firecracker version. @@ -1347,6 +1391,72 @@ definitions: description: MicroVM hypervisor build version. type: string + GuestMemoryRegionMapping: + type: object + description: Describes the region of guest memory that can be used for creating the memfile. + required: + - base_host_virt_addr + - size + - offset + - page_size + properties: + base_host_virt_addr: + type: integer + size: + description: The size of the region in bytes. + type: integer + offset: + description: The offset of the region in bytes. + type: integer + page_size: + description: The page size in bytes. + type: integer + + MemoryMappingsResponse: + type: object + description: Response containing memory region mappings. + required: + - mappings + properties: + mappings: + type: array + description: The memory region mappings. 
+ items: + $ref: "#/definitions/GuestMemoryRegionMapping" + + MemoryResponse: + type: object + description: Response containing the memory info (resident and empty pages). + required: + - resident + - empty + properties: + resident: + type: array + description: The resident bitmap as a vector of u64 values. Each bit represents if the page is resident. + items: + type: integer + format: uint64 + empty: + type: array + description: The empty bitmap as a vector of u64 values. Each bit represents if the page is zero (empty). This is a subset of the resident pages. + items: + type: integer + format: uint64 + + DirtyMemory: + type: object + description: Response containing the bitmap (one bit per page) of dirty pages of guest memory + required: + - bitmap + properties: + bitmap: + type: array + description: The dirty bitmap as a vector of u64 values. Each bit represents if the page is dirty. + items: + type: integer + format: uint64 + Logger: type: object description: From 2035236d64a006bdfb62c1bf56fb82cc9ed701c5 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Feb 2026 13:57:54 -0800 Subject: [PATCH 44/53] snapshot: make memory path optional in snapshot creation There are cases where a user might want to snapshot the memory of a VM externally. In these cases, we can ask Firecracker to avoid serializing the memory file to disk when we create a snapshot. 
Signed-off-by: Babis Chalios --- src/firecracker/src/api_server/mod.rs | 4 ++-- src/firecracker/src/api_server/request/snapshot.rs | 4 ++-- src/firecracker/swagger/firecracker.yaml | 6 ++++-- src/vmm/src/persist.rs | 4 +++- src/vmm/src/rpc_interface.rs | 2 +- src/vmm/src/vmm_config/snapshot.rs | 2 +- src/vmm/tests/integration_tests.rs | 2 +- 7 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/firecracker/src/api_server/mod.rs b/src/firecracker/src/api_server/mod.rs index 60daaa26639..961fc68e836 100644 --- a/src/firecracker/src/api_server/mod.rs +++ b/src/firecracker/src/api_server/mod.rs @@ -275,7 +275,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); @@ -288,7 +288,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); diff --git a/src/firecracker/src/api_server/request/snapshot.rs b/src/firecracker/src/api_server/request/snapshot.rs index 1e540dc2af0..0f562b021b5 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -145,7 +145,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), @@ -159,7 +159,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( 
vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index e0a7f410eaf..a9c1cc33212 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1665,12 +1665,14 @@ definitions: SnapshotCreateParams: type: object required: - - mem_file_path - snapshot_path properties: mem_file_path: type: string - description: Path to the file that will contain the guest memory. + description: + Path to the file that will contain the guest memory. It is optional. + In case that a user doesn't provide a path, they are responsible to + ensure they store the microVM's memory state via external means. snapshot_path: type: string description: Path to the file that will contain the microVM state. diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index ad954c041d1..6c1e6a00594 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -161,8 +161,10 @@ pub fn create_snapshot( snapshot_state_to_file(&microvm_state, &params.snapshot_path)?; + if let Some(mem_file_path) = params.mem_file_path.as_ref() { vmm.vm - .snapshot_memory_to_file(&params.mem_file_path, params.snapshot_type)?; + .snapshot_memory_to_file(mem_file_path, params.snapshot_type)?; + } // We need to mark queues as dirty again for all activated devices.
The reason we // do it here is that we don't mark pages as dirty during runtime diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index f186447d3dd..4a4e50681da 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -1243,7 +1243,7 @@ mod tests { CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), }, ))); #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index b429c9a241a..393ae945498 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -44,7 +44,7 @@ pub struct CreateSnapshotParams { /// Path to the file that will contain the microVM state. pub snapshot_path: PathBuf, /// Path to the file that will contain the guest memory. - pub mem_file_path: PathBuf, + pub mem_file_path: Option, } /// Allows for changing the mapping between tap devices and host devices diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 3a546643715..4d09fe62b0f 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -235,7 +235,7 @@ fn verify_create_snapshot( let snapshot_params = CreateSnapshotParams { snapshot_type, snapshot_path: snapshot_file.as_path().to_path_buf(), - mem_file_path: memory_file.as_path().to_path_buf(), + mem_file_path: Some(memory_file.as_path().to_path_buf()), }; controller From 160c3af98ab4def9dd23ab51ba37a14c911ffc57 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Feb 2026 16:26:36 -0800 Subject: [PATCH 45/53] api: implement API for getting guest memory mappings Implement API /memory/mappings which returns the memory mappings of guest physical to host virtual memory. 
Signed-off-by: Babis Chalios --- .../src/api_server/parsed_request.rs | 6 +++++ .../src/api_server/request/memory_info.rs | 21 ++++++++++++++++ src/firecracker/src/api_server/request/mod.rs | 1 + src/vmm/src/lib.rs | 24 ++++++++++++++++++- src/vmm/src/rpc_interface.rs | 22 ++++++++++++++++- src/vmm/src/vmm_config/meminfo.rs | 11 +++++++++ src/vmm/src/vmm_config/mod.rs | 2 ++ 7 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 src/firecracker/src/api_server/request/memory_info.rs create mode 100644 src/vmm/src/vmm_config/meminfo.rs diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index f98170ccbea..cb6d3b6f6ef 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -31,6 +31,7 @@ use super::request::vsock::parse_put_vsock; use crate::api_server::request::hotplug::memory::{ parse_get_memory_hotplug, parse_patch_memory_hotplug, parse_put_memory_hotplug, }; +use crate::api_server::request::memory_info::parse_get_memory; use crate::api_server::request::serial::parse_put_serial; #[derive(Debug)] @@ -91,6 +92,7 @@ impl TryFrom<&Request> for ParsedRequest { (Method::Get, "hotplug", None) if path_tokens.next() == Some("memory") => { parse_get_memory_hotplug() } + (Method::Get, "memory", None) => parse_get_memory(path_tokens), (Method::Get, _, Some(_)) => method_to_error(Method::Get), (Method::Put, "actions", Some(body)) => parse_put_actions(body), (Method::Put, "balloon", Some(body)) => parse_put_balloon(body), @@ -196,6 +198,7 @@ impl ParsedRequest { &serde_json::json!({ "firecracker_version": version.as_str() }), ), VmmData::FullVmConfig(config) => Self::success_response_with_data(config), + VmmData::MemoryMappings(mappings) => Self::success_response_with_data(mappings), }, Err(vmm_action_error) => { let mut response = match vmm_action_error { @@ -610,6 +613,9 @@ pub mod tests { &serde_json::json!({ "firecracker_version": 
version.as_str() }).to_string(), 200, ), + VmmData::MemoryMappings(mappings) => { + http_response(&serde_json::to_string(mappings).unwrap(), 200) + } }; let response = ParsedRequest::convert_to_response(&data); response.write_all(&mut buf).unwrap(); diff --git a/src/firecracker/src/api_server/request/memory_info.rs b/src/firecracker/src/api_server/request/memory_info.rs new file mode 100644 index 00000000000..0d34b542180 --- /dev/null +++ b/src/firecracker/src/api_server/request/memory_info.rs @@ -0,0 +1,21 @@ +use micro_http::{Method, StatusCode}; +use vmm::rpc_interface::VmmAction; + +use crate::api_server::parsed_request::{ParsedRequest, RequestError}; + +pub(crate) fn parse_get_memory<'a, T>(mut path_tokens: T) -> Result +where + T: Iterator, +{ + match path_tokens.next() { + Some("mappings") => Ok(ParsedRequest::new_sync(VmmAction::GetMemoryMappings)), + Some(unknown_path) => Err(RequestError::InvalidPathMethod( + format!("/memory/{}", unknown_path), + Method::Get, + )), + None => Err(RequestError::Generic( + StatusCode::BadRequest, + "Missing memory info type.".to_string(), + )), + } +} diff --git a/src/firecracker/src/api_server/request/mod.rs b/src/firecracker/src/api_server/request/mod.rs index 9be4617bd8e..89472c52d8e 100644 --- a/src/firecracker/src/api_server/request/mod.rs +++ b/src/firecracker/src/api_server/request/mod.rs @@ -11,6 +11,7 @@ pub mod hotplug; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory_info; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 30273e92c06..a23e3d3dcbb 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -144,8 +144,9 @@ use crate::devices::virtio::block::device::Block; use crate::devices::virtio::mem::{VIRTIO_MEM_DEV_ID, VirtioMem, VirtioMemError, VirtioMemStatus}; use crate::devices::virtio::net::Net; use crate::logger::{METRICS, MetricsError, error, info, warn}; -use crate::persist::{MicrovmState, MicrovmStateError, 
VmInfo}; +use crate::persist::{GuestRegionUffdMapping, MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; +use crate::utils::usize_to_u64; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::VcpuState; @@ -690,6 +691,27 @@ impl Vmm { pub fn vm(&self) -> &Vm { &self.vm } + + /// Get the list of mappings for guest memory + pub fn guest_memory_mappings(&self, page_size: usize) -> Vec { + let mut mappings = vec![]; + let mut offset = 0; + + for region in self.vm.guest_memory().iter() { + #[allow(deprecated)] + mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: region.as_ptr() as u64, + size: region.size(), + offset, + page_size, + page_size_kib: page_size, + }); + + offset += usize_to_u64(region.size()); + } + + mappings + } } /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 4a4e50681da..13ff0898bb2 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -30,6 +30,7 @@ use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, Drive use crate::vmm_config::entropy::{EntropyDeviceConfig, EntropyDeviceError}; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::{MachineConfig, MachineConfigError, MachineConfigUpdate}; +use crate::vmm_config::meminfo::MemoryMapingsResponse; use crate::vmm_config::memory_hotplug::{ MemoryHotplugConfig, MemoryHotplugConfigError, MemoryHotplugSizeUpdate, }; @@ -146,6 +147,8 @@ pub enum VmmAction { /// Update the microVM configuration (memory & vcpu) using `VmUpdateConfig` as input. This /// action can only be called before the microVM has booted. 
UpdateMachineConfiguration(MachineConfigUpdate), + /// Get the guest memory mappings to host memory + GetMemoryMappings, } /// Wrapper for all errors associated with VMM actions. @@ -228,6 +231,8 @@ pub enum VmmData { VirtioMemStatus(VirtioMemStatus), /// The status of the virtio-balloon hinting run HintingStatus(HintingStatus), + /// The guest memory mapping information. + MemoryMappings(MemoryMapingsResponse), } /// Trait used for deduplicating the MMDS request handling across the two ApiControllers. @@ -495,7 +500,8 @@ impl<'a> PrebootApiController<'a> { | UpdateNetworkInterface(_) | StartFreePageHinting(_) | GetFreePageHintingStatus - | StopFreePageHinting => Err(VmmActionError::OperationNotSupportedPreBoot), + | StopFreePageHinting + | GetMemoryMappings => Err(VmmActionError::OperationNotSupportedPreBoot), #[cfg(target_arch = "x86_64")] SendCtrlAltDel => Err(VmmActionError::OperationNotSupportedPreBoot), } @@ -771,6 +777,7 @@ impl RuntimeApiController { .update_memory_hotplug_size(cfg.requested_size_mib) .map(|_| VmmData::Empty) .map_err(VmmActionError::MemoryHotplugUpdate), + GetMemoryMappings => self.get_guest_memory_mappings(), // Operations not allowed post-boot. 
ConfigureBootSource(_) | ConfigureLogger(_) @@ -937,6 +944,19 @@ impl RuntimeApiController { .map_err(NetworkInterfaceError::DeviceUpdate) .map_err(VmmActionError::NetworkConfig) } + + /// Get guest memory mappings + fn get_guest_memory_mappings(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + + let vmm = self.vmm.lock().expect("Poisoned lock"); + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let mappings = vmm.guest_memory_mappings(page_size); + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get memory mappings' VMM action took {elapsed_time_us} us."); + Ok(VmmData::MemoryMappings(MemoryMapingsResponse { mappings })) + } } #[cfg(test)] diff --git a/src/vmm/src/vmm_config/meminfo.rs b/src/vmm/src/vmm_config/meminfo.rs new file mode 100644 index 00000000000..788bda78389 --- /dev/null +++ b/src/vmm/src/vmm_config/meminfo.rs @@ -0,0 +1,11 @@ +use serde::Serialize; + +use crate::persist::GuestRegionUffdMapping; + +/// Serializeable struct that contains information about guest's memory mappings +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryMapingsResponse { + /// Vector with mappings from guest physical to host virtual memoryv + pub mappings: Vec, +} + diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index 9a4c104ce3a..c593b3ec0dc 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -20,6 +20,8 @@ pub mod entropy; pub mod instance_info; /// Wrapper for configuring the memory and CPU of the microVM. pub mod machine_config; +/// Wrapper for getting memory-related information. +pub mod meminfo; /// Wrapper for configuring memory hotplug. pub mod memory_hotplug; /// Wrapper for configuring the metrics. 
From cd3fe9a77bdbb232e00b05c9f08d3e138daa04cd Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Feb 2026 14:57:13 -0800 Subject: [PATCH 46/53] api: implement API for resident and zero memory Implement API /memory which returns two bitmaps: resident and empty. `resident` tracks whether a guest page is in the resident set and `empty` tracks whether it's actually all 0s. Both bitmaps are structured as vectors of u64, so their length is: total_number_of_pages.div_ceil(64). Pages are ordered in the order of pages as reported by /memory/mappings. Signed-off-by: Babis Chalios --- .../src/api_server/parsed_request.rs | 4 ++ .../src/api_server/request/memory_info.rs | 7 +- src/vmm/src/builder.rs | 4 ++ src/vmm/src/lib.rs | 70 +++++++++++++++++-- src/vmm/src/persist.rs | 4 +- src/vmm/src/rpc_interface.rs | 33 ++++++++- src/vmm/src/vmm_config/meminfo.rs | 10 +++ src/vmm/src/vstate/vm.rs | 22 ++++-- 8 files changed, 134 insertions(+), 20 deletions(-) diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index cb6d3b6f6ef..057e9718fae 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -199,6 +199,7 @@ impl ParsedRequest { ), VmmData::FullVmConfig(config) => Self::success_response_with_data(config), VmmData::MemoryMappings(mappings) => Self::success_response_with_data(mappings), + VmmData::Memory(meminfo) => Self::success_response_with_data(meminfo), }, Err(vmm_action_error) => { let mut response = match vmm_action_error { @@ -616,6 +617,9 @@ pub mod tests { VmmData::MemoryMappings(mappings) => { http_response(&serde_json::to_string(mappings).unwrap(), 200) } + VmmData::Memory(meminfo) => { + http_response(&serde_json::to_string(meminfo).unwrap(), 200) + } }; let response = ParsedRequest::convert_to_response(&data); response.write_all(&mut buf).unwrap(); diff --git a/src/firecracker/src/api_server/request/memory_info.rs
b/src/firecracker/src/api_server/request/memory_info.rs index 0d34b542180..150916a4c2f 100644 --- a/src/firecracker/src/api_server/request/memory_info.rs +++ b/src/firecracker/src/api_server/request/memory_info.rs @@ -1,4 +1,4 @@ -use micro_http::{Method, StatusCode}; +use micro_http::Method; use vmm::rpc_interface::VmmAction; use crate::api_server::parsed_request::{ParsedRequest, RequestError}; @@ -13,9 +13,6 @@ where format!("/memory/{}", unknown_path), Method::Get, )), - None => Err(RequestError::Generic( - StatusCode::BadRequest, - "Missing memory info type.".to_string(), - )), + None => Ok(ParsedRequest::new_sync(VmmAction::GetMemory)), } } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 689122a3e07..15be948861d 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -318,6 +318,7 @@ pub fn build_microvm_for_boot( vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager, + page_size: vm_resources.machine_config.huge_pages.page_size(), }; let vmm = Arc::new(Mutex::new(vmm)); @@ -524,6 +525,7 @@ pub fn build_microvm_from_snapshot( vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager, + page_size: vm_resources.machine_config.huge_pages.page_size(), }; // Move vcpus to their own threads and start their state machine in the 'Paused' state. 
@@ -757,6 +759,7 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::arch::host_page_size; use crate::device_manager::tests::default_device_manager; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::generated::virtio_ids; @@ -842,6 +845,7 @@ pub(crate) mod tests { vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager: default_device_manager(), + page_size: host_page_size(), } } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index a23e3d3dcbb..2973a7089df 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -152,6 +152,7 @@ use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::VcpuState; pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse}; pub use crate::vstate::vm::Vm; +use crate::vstate::vm::mincore_bitmap; /// Shorthand type for the EventManager flavour used by Firecracker. pub type EventManager = BaseEventManager>>; @@ -314,6 +315,8 @@ pub struct Vmm { vcpus_exit_evt: EventFd, // Device manager device_manager: DeviceManager, + /// Page size used for backing guest memory + pub page_size: usize, } impl Vmm { @@ -697,21 +700,80 @@ impl Vmm { let mut mappings = vec![]; let mut offset = 0; - for region in self.vm.guest_memory().iter() { + for region in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + let size = region.slice.len(); #[allow(deprecated)] mappings.push(GuestRegionUffdMapping { - base_host_virt_addr: region.as_ptr() as u64, - size: region.size(), + base_host_virt_addr: region.slice.ptr_guard_mut().as_ptr() as u64, + size, offset, page_size, page_size_kib: page_size, }); - offset += usize_to_u64(region.size()); + offset += usize_to_u64(size); } mappings } + + /// Get info regarding resident and empty pages for guest memory + pub fn guest_memory_info(&self, page_size: usize) -> Result<(Vec, Vec), VmmError> { + let mut resident = vec![]; + let mut empty = 
vec![]; + let zero_page = vec![0u8; page_size]; + + for mem_slot in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + debug_assert!(mem_slot.slice.len().is_multiple_of(page_size)); + debug_assert!( + (mem_slot.slice.ptr_guard_mut().as_ptr() as usize).is_multiple_of(page_size) + ); + + let len = mem_slot.slice.len(); + let nr_pages = len / page_size; + let addr = mem_slot.slice.ptr_guard_mut().as_ptr(); + let mut curr_empty = vec![0u64; nr_pages.div_ceil(64)]; + let curr_resident = mincore_bitmap(addr, mem_slot.slice.len(), page_size)?; + + for page_idx in 0..nr_pages { + if (curr_resident[page_idx / 64] & (1u64 << (page_idx % 64))) == 0 { + continue; + } + + // SAFETY: `addr` points to a memory region that is `nr_pages * page_size` long. + let curr_addr = unsafe { addr.add(page_idx * page_size) }; + + // SAFETY: both addresses are valid and they point to a memory region + // that is (at least) `page_size` long + let ret = unsafe { + libc::memcmp( + curr_addr.cast::(), + zero_page.as_ptr().cast::(), + page_size, + ) + }; + + if ret == 0 { + curr_empty[page_idx / 64] |= 1u64 << (page_idx % 64); + } + } + + resident.extend_from_slice(&curr_resident); + empty.extend_from_slice(&curr_empty); + } + + Ok((resident, empty)) + } } /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 6c1e6a00594..29be397ad5d 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -162,8 +162,8 @@ pub fn create_snapshot( snapshot_state_to_file(&microvm_state, &params.snapshot_path)?; if let Some(mem_file_path) = params.mem_file_path.as_ref() { - vmm.vm - .snapshot_memory_to_file(mem_file_path, params.snapshot_type)?; + vmm.vm + .snapshot_memory_to_file(mem_file_path, params.snapshot_type, vmm.page_size)?; } // We need to mark queues as dirty again for all activated devices.
The reason we diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 13ff0898bb2..6bbfa6eab7d 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -28,9 +28,9 @@ use crate::vmm_config::balloon::{ use crate::vmm_config::boot_source::{BootSourceConfig, BootSourceConfigError}; use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, DriveError}; use crate::vmm_config::entropy::{EntropyDeviceConfig, EntropyDeviceError}; -use crate::vmm_config::instance_info::InstanceInfo; +use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vmm_config::machine_config::{MachineConfig, MachineConfigError, MachineConfigUpdate}; -use crate::vmm_config::meminfo::MemoryMapingsResponse; +use crate::vmm_config::meminfo::{MemoryMapingsResponse, MemoryResponse}; use crate::vmm_config::memory_hotplug::{ MemoryHotplugConfig, MemoryHotplugConfigError, MemoryHotplugSizeUpdate, }; @@ -149,6 +149,8 @@ pub enum VmmAction { UpdateMachineConfiguration(MachineConfigUpdate), /// Get the guest memory mappings to host memory GetMemoryMappings, + /// Get guest memory resident and empty pages information + GetMemory, } /// Wrapper for all errors associated with VMM actions. @@ -200,6 +202,8 @@ pub enum VmmActionError { OperationNotSupportedPostBoot, /// The requested operation is not supported before starting the microVM. OperationNotSupportedPreBoot, + /// The requested operation is not supported while the microVM is running. + OperationNotSupportedWhileRunning, /// Start microvm error: {0} StartMicrovm(#[from] StartMicrovmError), /// Vsock config error: {0} @@ -233,6 +237,8 @@ pub enum VmmData { HintingStatus(HintingStatus), /// The guest memory mapping information. MemoryMappings(MemoryMapingsResponse), + /// The guest memory resident and empty pages information + Memory(MemoryResponse), } /// Trait used for deduplicating the MMDS request handling across the two ApiControllers. 
@@ -501,7 +507,8 @@ impl<'a> PrebootApiController<'a> { | StartFreePageHinting(_) | GetFreePageHintingStatus | StopFreePageHinting - | GetMemoryMappings => Err(VmmActionError::OperationNotSupportedPreBoot), + | GetMemoryMappings + | GetMemory => Err(VmmActionError::OperationNotSupportedPreBoot), #[cfg(target_arch = "x86_64")] SendCtrlAltDel => Err(VmmActionError::OperationNotSupportedPreBoot), } @@ -778,6 +785,7 @@ impl RuntimeApiController { .map(|_| VmmData::Empty) .map_err(VmmActionError::MemoryHotplugUpdate), GetMemoryMappings => self.get_guest_memory_mappings(), + GetMemory => self.get_guest_memory_info(), // Operations not allowed post-boot. ConfigureBootSource(_) | ConfigureLogger(_) @@ -957,6 +965,25 @@ impl RuntimeApiController { info!("'get memory mappings' VMM action took {elapsed_time_us} us."); Ok(VmmData::MemoryMappings(MemoryMapingsResponse { mappings })) } + + /// Get resident and empty pages information for guest memory + fn get_guest_memory_info(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + let vmm = self.vmm.lock().expect("Poisoned lock"); + + // Check if VM is paused + if vmm.instance_info.state != VmState::Paused { + return Err(VmmActionError::OperationNotSupportedWhileRunning); + } + + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let (resident, empty) = vmm.guest_memory_info(page_size)?; + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get memory info' VMM action took {elapsed_time_us} us."); + + Ok(VmmData::Memory(MemoryResponse { resident, empty })) + } } #[cfg(test)] diff --git a/src/vmm/src/vmm_config/meminfo.rs b/src/vmm/src/vmm_config/meminfo.rs index 788bda78389..1169eb432c2 100644 --- a/src/vmm/src/vmm_config/meminfo.rs +++ b/src/vmm/src/vmm_config/meminfo.rs @@ -9,3 +9,13 @@ pub struct MemoryMapingsResponse { pub mappings: Vec, } +/// Information about guest memory resident pages and pages that are all-0s +#[derive(Clone, Debug, Default, 
PartialEq, Eq, Serialize)] +pub struct MemoryResponse { + /// Bitmap for resident pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page is present in the resident memory set + pub resident: Vec, + /// Bitmap for empty pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page is empty (all 0s). + pub empty: Vec, +} diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 37e9039b8e8..0ecf2ef94b7 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -305,7 +305,7 @@ impl Vm { } /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. - pub fn get_dirty_bitmap(&self) -> Result { + pub fn get_dirty_bitmap(&self, page_size: usize) -> Result { self.guest_memory() .iter() .flat_map(|region| region.plugged_slots()) @@ -318,6 +318,7 @@ impl Vm { None => mincore_bitmap( mem_slot.slice.ptr_guard_mut().as_ptr(), mem_slot.slice.len(), + page_size, )?, }; Ok((mem_slot.slot, bitmap)) @@ -335,6 +336,7 @@ impl Vm { &self, mem_file_path: &Path, snapshot_type: SnapshotType, + page_size: usize, ) -> Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; @@ -377,7 +379,7 @@ impl Vm { match snapshot_type { SnapshotType::Diff => { - let dirty_bitmap = self.get_dirty_bitmap()?; + let dirty_bitmap = self.get_dirty_bitmap(page_size)?; self.guest_memory().dump_dirty(&mut file, &dirty_bitmap)?; } SnapshotType::Full => { @@ -503,7 +505,11 @@ impl Vm { /// Use `mincore(2)` to overapproximate the dirty bitmap for the given memslot. To be used /// if a diff snapshot is requested, but dirty page tracking wasn't enabled. 
-fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { +pub(crate) fn mincore_bitmap( + addr: *mut u8, + len: usize, + page_size: usize, +) -> Result, VmError> { // TODO: Once Host 5.10 goes out of support, we can make this more robust and work on // swap-enabled systems, by doing mlock2(MLOCK_ONFAULT)/munlock() in this function (to // force swapped-out pages to get paged in, so that mincore will consider them incore). @@ -513,8 +519,11 @@ fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { // Mincore always works at PAGE_SIZE granularity, even if the VMA we are dealing with // is a hugetlbfs VMA (e.g. to report a single hugepage as "present", mincore will // give us 512 4k markers with the lowest bit set). - let page_size = host_page_size(); - let mut mincore_bitmap = vec![0u8; len / page_size]; + let host_page_size = host_page_size(); + let mut mincore_bitmap = vec![0u8; len / host_page_size]; + // The bitmap we return though tracks pages in terms of the actually used page size. In + // the case of a hugetlbfs VMA, we just need to check if the first of the reported pages + // is present. let mut bitmap = vec![0u64; (len / page_size).div_ceil(64)]; // SAFETY: The safety invariants of GuestRegionMmap ensure that region.as_ptr() is a valid @@ -529,7 +538,8 @@ fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { return Err(VmError::Mincore(vmm_sys_util::errno::Error::last())); } - for (page_idx, b) in mincore_bitmap.iter().enumerate() { + let step = page_size / host_page_size; + for (page_idx, b) in mincore_bitmap.iter().step_by(step).enumerate() { bitmap[page_idx / 64] |= (*b as u64 & 0x1) << (page_idx as u64 % 64); } From 92eacbd305a08bd8d36c1ea12e0d3148b5c1b94e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Feb 2026 15:48:06 -0800 Subject: [PATCH 47/53] api: implement API for dirty memory Implement API /memory/dirty which returns a bitmap tracking dirty guest memory. 
The bitmap is structured as a vector of u64, so its length is: total_number_of_pages.div_ceil(64). Pages are ordered in the order of pages as reported by /memory/mappings. Signed-off-by: Babis Chalios --- .../seccomp/x86_64-unknown-linux-musl.json | 3 + .../src/api_server/parsed_request.rs | 4 + .../src/api_server/request/memory_info.rs | 1 + src/vmm/src/lib.rs | 45 +++++++ src/vmm/src/rpc_interface.rs | 29 ++++- src/vmm/src/utils/mod.rs | 2 + src/vmm/src/utils/pagemap.rs | 115 ++++++++++++++++++ src/vmm/src/vmm_config/meminfo.rs | 8 ++ 8 files changed, 205 insertions(+), 2 deletions(-) create mode 100644 src/vmm/src/utils/pagemap.rs diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index dcd6753a4c5..1eb2d83e0f2 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -31,6 +31,9 @@ { "syscall": "mincore" }, + { + "syscall": "pread64" + }, { "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index 057e9718fae..478483e9ad9 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -200,6 +200,7 @@ impl ParsedRequest { VmmData::FullVmConfig(config) => Self::success_response_with_data(config), VmmData::MemoryMappings(mappings) => Self::success_response_with_data(mappings), VmmData::Memory(meminfo) => Self::success_response_with_data(meminfo), + VmmData::MemoryDirty(dirty) => Self::success_response_with_data(dirty), }, Err(vmm_action_error) => { let mut response = match vmm_action_error { @@ -620,6 +621,9 @@ pub mod tests { VmmData::Memory(meminfo) => { http_response(&serde_json::to_string(meminfo).unwrap(), 200) } + VmmData::MemoryDirty(dirty) => { + http_response(&serde_json::to_string(dirty).unwrap(), 200) + } }; let response = 
ParsedRequest::convert_to_response(&data); response.write_all(&mut buf).unwrap(); diff --git a/src/firecracker/src/api_server/request/memory_info.rs b/src/firecracker/src/api_server/request/memory_info.rs index 150916a4c2f..2d8e55a420e 100644 --- a/src/firecracker/src/api_server/request/memory_info.rs +++ b/src/firecracker/src/api_server/request/memory_info.rs @@ -9,6 +9,7 @@ where { match path_tokens.next() { Some("mappings") => Ok(ParsedRequest::new_sync(VmmAction::GetMemoryMappings)), + Some("dirty") => Ok(ParsedRequest::new_sync(VmmAction::GetMemoryDirty)), Some(unknown_path) => Err(RequestError::InvalidPathMethod( format!("/memory/{}", unknown_path), Method::Get, diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 2973a7089df..06144a5ddd9 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -256,6 +256,8 @@ pub enum VmmError { Block(#[from] BlockError), /// Balloon: {0} Balloon(#[from] BalloonError), + /// Pagemap error: {0} + Pagemap(#[from] utils::pagemap::PagemapError), /// Failed to create memory hotplug device: {0} VirtioMem(#[from] VirtioMemError), } @@ -774,6 +776,49 @@ impl Vmm { Ok((resident, empty)) } + + /// Get dirty pages bitmap for guest memory + pub fn get_dirty_memory(&self, page_size: usize) -> Result, VmmError> { + let pagemap = utils::pagemap::PagemapReader::new(page_size)?; + let mut dirty_bitmap = vec![]; + + for mem_slot in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + let base_addr = mem_slot.slice.ptr_guard_mut().as_ptr() as usize; + let len = mem_slot.slice.len(); + let nr_pages = len / page_size; + + // Use mincore_bitmap to get resident pages at guest page size granularity + let resident_bitmap = vstate::vm::mincore_bitmap(base_addr as *mut u8, len, page_size)?; + + // TODO: if we don't support UFFD/async WP, we can completely skip this bit. For the + // time being, we always do. + // + // Build dirty bitmap: check pagemap only for pages that mincore reports resident. 
+ // This way we reduce the amount of times we read out of /proc//pagemap. + let mut slot_bitmap = vec![0u64; nr_pages.div_ceil(64)]; + for page_idx in 0..nr_pages { + // Check if page is resident in the bitmap. + // TODO: These operations (add to bitmap, check for presence, etc.) merit their own + // implementation, somewhere within a bitmap type). + let is_resident = (resident_bitmap[page_idx / 64] & (1u64 << (page_idx % 64))) != 0; + if is_resident { + let virt_addr = base_addr + (page_idx * page_size); + if pagemap.is_page_dirty(virt_addr)? { + slot_bitmap[page_idx / 64] |= 1u64 << (page_idx % 64); + } + } + } + + dirty_bitmap.extend_from_slice(&slot_bitmap); + } + + Ok(dirty_bitmap) + } } /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 6bbfa6eab7d..78f3a254518 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -30,7 +30,7 @@ use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, Drive use crate::vmm_config::entropy::{EntropyDeviceConfig, EntropyDeviceError}; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vmm_config::machine_config::{MachineConfig, MachineConfigError, MachineConfigUpdate}; -use crate::vmm_config::meminfo::{MemoryMapingsResponse, MemoryResponse}; +use crate::vmm_config::meminfo::{MemoryDirty, MemoryMapingsResponse, MemoryResponse}; use crate::vmm_config::memory_hotplug::{ MemoryHotplugConfig, MemoryHotplugConfigError, MemoryHotplugSizeUpdate, }; @@ -151,6 +151,8 @@ pub enum VmmAction { GetMemoryMappings, /// Get guest memory resident and empty pages information GetMemory, + /// Get guest memory dirty pages information + GetMemoryDirty, } /// Wrapper for all errors associated with VMM actions. 
@@ -239,6 +241,8 @@ pub enum VmmData { MemoryMappings(MemoryMapingsResponse), /// The guest memory resident and empty pages information Memory(MemoryResponse), + /// The guest memory dirty pages information + MemoryDirty(MemoryDirty), } /// Trait used for deduplicating the MMDS request handling across the two ApiControllers. @@ -508,7 +512,8 @@ impl<'a> PrebootApiController<'a> { | GetFreePageHintingStatus | StopFreePageHinting | GetMemoryMappings - | GetMemory => Err(VmmActionError::OperationNotSupportedPreBoot), + | GetMemory + | GetMemoryDirty => Err(VmmActionError::OperationNotSupportedPreBoot), #[cfg(target_arch = "x86_64")] SendCtrlAltDel => Err(VmmActionError::OperationNotSupportedPreBoot), } @@ -786,6 +791,7 @@ impl RuntimeApiController { .map_err(VmmActionError::MemoryHotplugUpdate), GetMemoryMappings => self.get_guest_memory_mappings(), GetMemory => self.get_guest_memory_info(), + GetMemoryDirty => self.get_dirty_memory_info(), // Operations not allowed post-boot. ConfigureBootSource(_) | ConfigureLogger(_) @@ -984,6 +990,25 @@ impl RuntimeApiController { Ok(VmmData::Memory(MemoryResponse { resident, empty })) } + + /// Get dirty pages information for guest memory + fn get_dirty_memory_info(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + let vmm = self.vmm.lock().expect("Poisoned lock"); + + // Check if VM is paused + if vmm.instance_info.state != VmState::Paused { + return Err(VmmActionError::OperationNotSupportedWhileRunning); + } + + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let bitmap = vmm.get_dirty_memory(page_size)?; + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get dirty memory' VMM action took {elapsed_time_us} us."); + + Ok(VmmData::MemoryDirty(MemoryDirty { bitmap })) + } } #[cfg(test)] diff --git a/src/vmm/src/utils/mod.rs b/src/vmm/src/utils/mod.rs index 4cc7640fd74..4179be93fec 100644 --- a/src/vmm/src/utils/mod.rs +++ b/src/vmm/src/utils/mod.rs @@ 
-9,6 +9,8 @@ pub mod net; pub mod signal; /// Module with state machine pub mod sm; +/// Module with pagemap utilities +pub mod pagemap; use std::fs::{File, OpenOptions}; use std::num::Wrapping; diff --git a/src/vmm/src/utils/pagemap.rs b/src/vmm/src/utils/pagemap.rs new file mode 100644 index 00000000000..fff9e1f5cb2 --- /dev/null +++ b/src/vmm/src/utils/pagemap.rs @@ -0,0 +1,115 @@ +//! Utilities for reading /proc/self/pagemap to track dirty pages. + +#![allow(clippy::cast_possible_wrap)] + +use std::fs::File; +use std::os::unix::io::AsRawFd; + +use crate::arch::host_page_size; + +const PAGEMAP_ENTRY_SIZE: usize = 8; + +/// Errors related to pagemap operations +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PagemapError { + /// Failed to open /proc/self/pagemap: {0} + OpenPagemap(#[source] std::io::Error), + /// Failed to read pagemap entry: {0} + ReadEntry(#[source] std::io::Error), + /// Failed to open /proc/self/clear_refs: {0} + OpenClearRefs(#[source] std::io::Error), + /// Failed to clear soft-dirty bits: {0} + ClearSoftDirty(#[source] std::io::Error), +} + +/// Represents a single entry in /proc/pid/pagemap. 
+/// +/// Each virtual page has an 8-byte entry with the following layout: +/// - Bits 0-54: Page frame number (PFN) if present +/// - Bit 55: Page is soft-dirty (written to since last clear) +/// - Bit 56: Page is exclusively mapped +/// - Bit 57: Page is write-protected via userfaultfd +/// - Bit 58: Unused +/// - Bit 59-60: Unused +/// - Bit 61: Page is file-page or shared-anon +/// - Bit 62: Page is swapped +/// - Bit 63: Page is present in RAM +#[derive(Debug, Clone, Copy)] +pub struct PagemapEntry { + raw: u64, +} + +impl PagemapEntry { + /// Create a PagemapEntry from bytes (native-endian, as read from the pagemap file) + pub fn from_bytes(bytes: [u8; 8]) -> Self { + Self { + raw: u64::from_ne_bytes(bytes), + } + } + + /// Check if page is write-protected via userfaultfd + pub fn is_write_protected(&self) -> bool { + (self.raw & (1u64 << 57)) != 0 + } + + /// Check if page is present in RAM (bit 63) + pub fn is_present(&self) -> bool { + (self.raw & (1u64 << 63)) != 0 + } +} + +/// Reader for /proc/self/pagemap +#[derive(Debug)] +pub struct PagemapReader { + pagemap_fd: File, +} + +impl PagemapReader { + /// Create a new PagemapReader + pub fn new(_page_size: usize) -> Result { + let pagemap_fd = File::open("/proc/self/pagemap").map_err(PagemapError::OpenPagemap)?; + + Ok(Self { pagemap_fd }) + } + + /// Check if a single page is dirty (write-protected bit cleared). + /// + /// Checks the first host page (4K) of the guest page at the given address. + /// For huge pages, all host pages within the huge page typically have the same + /// dirty status, so sampling the first is sufficient. + /// + /// # Arguments + /// * `virt_addr` - Virtual address of the page to check + /// + /// # Returns + /// True if the page is present and write-protected bit is cleared (dirty).
+ pub fn is_page_dirty(&self, virt_addr: usize) -> Result { + // Pagemap always uses host (4K) page size + let host_page_size = host_page_size(); + + // Calculate offset for this virtual page (using host page size) + let host_vpn = virt_addr / host_page_size; + let offset = (host_vpn * PAGEMAP_ENTRY_SIZE) as i64; + + let mut entry_bytes = [0u8; 8]; + + // SAFETY: pread is safe as long as the fd is valid and the buffer is properly sized + let ret = unsafe { + libc::pread( + self.pagemap_fd.as_raw_fd(), + entry_bytes.as_mut_ptr().cast(), + PAGEMAP_ENTRY_SIZE, + offset, + ) + }; + + if ret != PAGEMAP_ENTRY_SIZE as isize { + return Err(PagemapError::ReadEntry(std::io::Error::last_os_error())); + } + + let entry = PagemapEntry::from_bytes(entry_bytes); + + // Page must be present and the write_protected bit cleared (indicating it was written to) + Ok(entry.is_present() && !entry.is_write_protected()) + } +} diff --git a/src/vmm/src/vmm_config/meminfo.rs b/src/vmm/src/vmm_config/meminfo.rs index 1169eb432c2..693ece6b4d4 100644 --- a/src/vmm/src/vmm_config/meminfo.rs +++ b/src/vmm/src/vmm_config/meminfo.rs @@ -19,3 +19,11 @@ pub struct MemoryResponse { /// Each bit represents whether a page is empty (all 0s). pub empty: Vec, } + +/// Information about dirty guest memory pages +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryDirty { + /// Bitmap for dirty pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page has been written since the last snapshot. + pub bitmap: Vec, +} From e9febb1f128feaa1f783a3c5282d0313321911a4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 12 Feb 2026 14:51:29 -0800 Subject: [PATCH 48/53] feat: enable write-protection on guest memory UFFD provides an API to enable write-protection for memory ranges tracked by a userfault file descriptor. Detailed information can be found here: https://docs.kernel.org/admin-guide/mm/userfaultfd.html. 
To use the feature, users need to register the memory region with UFFDIO_REGISTER_MODE_WP. Then, users need to explicitly enable write-protection for sub-ranges of the registered region. Writes in pages within write-protected memory ranges can be handled in one of two ways. In synchronous mode, writes in a protected page will cause kernel to send a write protection event over the userfaultfd. In asynchronous mode, the kernel will automatically handle writes to protected pages by clearing the write-protection bit. Userspace can later observe the write protection bit by looking into the corresponding entry of /proc/pid/pagemap. This commit, unconditionally, enables write protection for guest memory using the asynchronous mode. !NOTE!: asynchronous write protection requires (host) kernel version 6.7 or later. Signed-off-by: Babis Chalios --- Cargo.lock | 29 ++++++++++++++++++++++++++--- src/vmm/Cargo.toml | 6 +++++- src/vmm/src/lib.rs | 3 ++- src/vmm/src/persist.rs | 26 ++++++++++++++++++++++---- 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 009f6ce23da..c159666eefb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -533,7 +533,7 @@ dependencies = [ "serde_json", "thiserror 2.0.17", "timerfd", - "userfaultfd", + "userfaultfd 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", "utils", "vmm", "vmm-sys-util", @@ -1399,7 +1399,20 @@ dependencies = [ "libc", "nix", "thiserror 1.0.69", - "userfaultfd-sys", + "userfaultfd-sys 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "userfaultfd" +version = "0.9.0" +source = "git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection#9f4f7b42adbb9bea59016f4af248ed547cf160f0" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "libc", + "nix", + "thiserror 1.0.69", + "userfaultfd-sys 0.6.0 (git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection)", ] [[package]] @@ -1413,6 +1426,16 @@ dependencies = [
"cfg-if", ] +[[package]] +name = "userfaultfd-sys" +version = "0.6.0" +source = "git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection#9f4f7b42adbb9bea59016f4af248ed547cf160f0" +dependencies = [ + "bindgen", + "cc", + "cfg-if", +] + [[package]] name = "utf8parse" version = "0.2.2" @@ -1531,7 +1554,7 @@ dependencies = [ "slab", "thiserror 2.0.17", "timerfd", - "userfaultfd", + "userfaultfd 0.9.0 (git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection)", "utils", "uuid", "vhost", diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index a09109251ac..8aa23f3037c 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -47,7 +47,11 @@ serde_json = "1.0.145" slab = "0.4.11" thiserror = "2.0.17" timerfd = "1.5.0" -userfaultfd = "0.9.0" +userfaultfd = { git = "https://github.com/e2b-dev/userfaultfd-rs", branch = "feat_write_protection", features = [ + "linux5_7", + "linux5_13", + "linux6_7" +] } utils = { path = "../utils" } uuid = "1.18.1" vhost = { version = "0.15.0", features = ["vhost-user-frontend"] } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 06144a5ddd9..b0031931200 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -795,7 +795,8 @@ impl Vmm { // Use mincore_bitmap to get resident pages at guest page size granularity let resident_bitmap = vstate::vm::mincore_bitmap(base_addr as *mut u8, len, page_size)?; - // TODO: if we don't support UFFD/async WP, we can completely skip this bit. For the + // TODO: if we don't support UFFD/async WP, we can completely skip this bit, as the + // UFFD handler already tracks dirty pages through the WriteProtected events. For the // time being, we always do. // // Build dirty bitmap: check pagemap only for pages that mincore reports resident. 
diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 29be397ad5d..ae6b837cf09 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -14,7 +14,7 @@ use std::sync::{Arc, Mutex}; use semver::Version; use serde::{Deserialize, Serialize}; -use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use userfaultfd::{FeatureFlags, RegisterMode, Uffd, UffdBuilder}; use vmm_sys_util::sock_ctrl_msg::ScmSocket; #[cfg(target_arch = "aarch64")] @@ -484,6 +484,8 @@ pub enum GuestMemoryFromUffdError { Create(userfaultfd::Error), /// Failed to register memory address range with the userfaultfd object: {0} Register(userfaultfd::Error), + /// Failed to enable write protection on memory address range with the userfaultfd object: {0} + WriteProtect(userfaultfd::Error), /// Failed to connect to UDS Unix stream: {0} Connect(#[from] std::io::Error), /// Failed to sends file descriptor: {0} @@ -505,7 +507,9 @@ fn guest_memory_from_uffd( // because the only place the kernel checks this is in a hook from madvise, e.g. it doesn't // actively change the behavior of UFFD, only passively. Without balloon devices // we never call madvise anyway, so no need to put this into a conditional. - uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); + uffd_builder.require_features( + FeatureFlags::EVENT_REMOVE | FeatureFlags::MISSING_HUGETLBFS | FeatureFlags::WP_ASYNC, + ); let uffd = uffd_builder .close_on_exec(true) @@ -515,8 +519,22 @@ fn guest_memory_from_uffd( .map_err(GuestMemoryFromUffdError::Create)?; for mem_region in guest_memory.iter() { - uffd.register(mem_region.as_ptr().cast(), mem_region.size() as _) - .map_err(GuestMemoryFromUffdError::Register)?; + uffd.register_with_mode( + mem_region.as_ptr().cast(), + mem_region.size() as _, + RegisterMode::MISSING | RegisterMode::WRITE_PROTECT, + ) + .map_err(GuestMemoryFromUffdError::Register)?; + + // If memory is backed by huge pages, we can immediately write protect it. 
+ // Otherwise (memory is backed by anonymous memory), write protecting here + // won't have any effect, as the write-protection bit for a page will be + // wiped when the first page fault occurs. These cases need to be handled + // directly from the UFFD handler. + if huge_pages.is_hugetlbfs() { + uffd.write_protect(mem_region.as_ptr().cast(), mem_region.size() as _) + .map_err(GuestMemoryFromUffdError::WriteProtect)?; + } } send_uffd_handshake(mem_uds_path, &backend_mappings, &uffd)?; From fff6fd9d8ae5e6334f1d2fc239e5514a36f7d8ee Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 13 Feb 2026 16:00:06 -0800 Subject: [PATCH 49/53] ci: remove dependency changes test This is an optional test on the Firecracker side and most of the time it's ignored (when valid dependency changes happen). Having this fail blocks our fc-versions releases. Signed-off-by: Babis Chalios --- .../workflows/dependency_modification_check.yml | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 .github/workflows/dependency_modification_check.yml diff --git a/.github/workflows/dependency_modification_check.yml b/.github/workflows/dependency_modification_check.yml deleted file mode 100644 index ac6537af102..00000000000 --- a/.github/workflows/dependency_modification_check.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Check no dependencies were modified - -on: pull_request - -jobs: - dependency_changed_check: - runs-on: ubuntu-latest - steps: - - name: "Checkout repository" - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - - - name: "Check Cargo.lock not in changeset" - run: | - git fetch origin - git diff origin/$GITHUB_BASE_REF.. --name-only| ( !
grep "Cargo.lock") From a284adfef05e4f4c5e414fed28e460371bb399b7 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 24 Feb 2026 14:18:48 +0100 Subject: [PATCH 50/53] feat: make network device snapshots backwards compatible TODO Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/net/persist.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index ba56cc39aac..aed1912a69e 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -30,6 +30,16 @@ pub struct NetConfigSpaceState { guest_mac: Option, } +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct RxBufferState { + // Number of iovecs we have parsed from the guest + parsed_descriptor_chains_nr: u16, + // Number of used descriptors + used_descriptors: u16, + // Number of used bytes + used_bytes: u32, +} + /// Information about the network device that are saved /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -42,6 +52,7 @@ pub struct NetState { pub mmds_ns: Option, config_space: NetConfigSpaceState, pub virtio_state: VirtioDeviceState, + rx_buffers_state: RxBufferState, } /// Auxiliary structure for creating a device when resuming from a snapshot. 
@@ -84,6 +95,7 @@ impl Persist<'_> for Net { guest_mac: self.guest_mac, }, virtio_state: VirtioDeviceState::from_device(self), + rx_buffers_state: RxBufferState::default(), } } @@ -128,6 +140,10 @@ impl Persist<'_> for Net { net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; + if state.virtio_state.activated { + net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr; + } + Ok(net) } } From 7a2ef60bbc2d28cba1154790769cde8769e6d080 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 26 Feb 2026 15:42:27 +0100 Subject: [PATCH 51/53] snapshot: add state types for previous versions Add descriptions for MicrovmState from previous Firecracker versions. Moreover, add methods to translate a snapshot file from previous versions into the current one. Signed-off-by: Babis Chalios --- src/vmm/src/arch/x86_64/vm.rs | 15 +- src/vmm/src/device_manager/persist.rs | 4 +- src/vmm/src/devices/acpi/mod.rs | 2 +- src/vmm/src/devices/acpi/vmclock.rs | 2 +- src/vmm/src/devices/virtio/balloon/device.rs | 2 +- src/vmm/src/devices/virtio/balloon/persist.rs | 42 +- .../virtio/block/vhost_user/persist.rs | 16 +- .../devices/virtio/block/virtio/persist.rs | 14 +- src/vmm/src/devices/virtio/net/persist.rs | 8 +- src/vmm/src/devices/virtio/persist.rs | 12 +- src/vmm/src/devices/virtio/rng/persist.rs | 2 +- src/vmm/src/{persist.rs => persist/mod.rs} | 4 + src/vmm/src/persist/v1_10/aarch64.rs | 45 ++ src/vmm/src/persist/v1_10/mod.rs | 154 +++++ src/vmm/src/persist/v1_10/x86_64.rs | 55 ++ src/vmm/src/persist/v1_12/aarch64.rs | 64 ++ src/vmm/src/persist/v1_12/mod.rs | 435 +++++++++++++ src/vmm/src/persist/v1_12/x86_64.rs | 46 ++ src/vmm/src/persist/v1_14/aarch64.rs | 201 ++++++ src/vmm/src/persist/v1_14/mod.rs | 593 ++++++++++++++++++ src/vmm/src/persist/v1_14/x86_64.rs | 93 +++ 21 files changed, 1752 insertions(+), 57 deletions(-) rename src/vmm/src/{persist.rs => persist/mod.rs} (99%) create mode 100644
src/vmm/src/persist/v1_10/aarch64.rs create mode 100644 src/vmm/src/persist/v1_10/mod.rs create mode 100644 src/vmm/src/persist/v1_10/x86_64.rs create mode 100644 src/vmm/src/persist/v1_12/aarch64.rs create mode 100644 src/vmm/src/persist/v1_12/mod.rs create mode 100644 src/vmm/src/persist/v1_12/x86_64.rs create mode 100644 src/vmm/src/persist/v1_14/aarch64.rs create mode 100644 src/vmm/src/persist/v1_14/mod.rs create mode 100644 src/vmm/src/persist/v1_14/x86_64.rs diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index 1bcd092b725..92a48252e78 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -236,13 +236,18 @@ pub struct VmState { pub memory: GuestMemoryState, /// resource allocator pub resource_allocator: ResourceAllocator, - pitstate: kvm_pit_state2, - clock: kvm_clock_data, + /// KVM interrupt timer + pub pitstate: kvm_pit_state2, + /// KVM clock data + pub clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. - pic_master: kvm_irqchip, + /// Master PIC controller + pub pic_master: kvm_irqchip, // TODO: rename this field to adopt inclusive language once Linux updates it, too. 
- pic_slave: kvm_irqchip, - ioapic: kvm_irqchip, + /// Slave PIC controller + pub pic_slave: kvm_irqchip, + /// IOAPIC + pub ioapic: kvm_irqchip, } impl fmt::Debug for VmState { diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 2a0393e57f2..8e815bf8b6c 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -168,9 +168,9 @@ impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct ACPIDeviceManagerState { - vmgenid: VMGenIDState, + pub vmgenid: VMGenIDState, #[cfg(target_arch = "x86_64")] - vmclock: VmClockState, + pub vmclock: VmClockState, } impl<'a> Persist<'a> for ACPIDeviceManager { diff --git a/src/vmm/src/devices/acpi/mod.rs b/src/vmm/src/devices/acpi/mod.rs index 8eba26ac41d..4e8c62922e6 100644 --- a/src/vmm/src/devices/acpi/mod.rs +++ b/src/vmm/src/devices/acpi/mod.rs @@ -1,6 +1,6 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -mod generated; +pub mod generated; pub mod vmclock; pub mod vmgenid; diff --git a/src/vmm/src/devices/acpi/vmclock.rs b/src/vmm/src/devices/acpi/vmclock.rs index d7882a78ded..56aee6e44d4 100644 --- a/src/vmm/src/devices/acpi/vmclock.rs +++ b/src/vmm/src/devices/acpi/vmclock.rs @@ -22,7 +22,7 @@ use crate::vstate::resources::ResourceAllocator; unsafe impl ByteValued for vmclock_abi {} // We are reserving a physical page to expose the [`VmClock`] data -const VMCLOCK_SIZE: u32 = 0x1000; +pub const VMCLOCK_SIZE: u32 = 0x1000; // Write a value in `vmclock_abi` both in the Firecracker-managed state // and inside guest memory address that corresponds to it. 
diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index a5095b44f67..411b84bc7be 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -78,7 +78,7 @@ unsafe impl ByteValued for ConfigSpace {} /// Holds state of the free page hinting run #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)] -pub(crate) struct HintingState { +pub struct HintingState { /// The command requested by us. Set to STOP by default. pub host_cmd: u32, /// The last command supplied by guest. diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 2314a98aa33..f044c99494b 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -31,22 +31,22 @@ pub struct BalloonConfigSpaceState { /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BalloonStatsState { - swap_in: Option, - swap_out: Option, - major_faults: Option, - minor_faults: Option, - free_memory: Option, - total_memory: Option, - available_memory: Option, - disk_caches: Option, - hugetlb_allocations: Option, - hugetlb_failures: Option, - oom_kill: Option, - alloc_stall: Option, - async_scan: Option, - direct_scan: Option, - async_reclaim: Option, - direct_reclaim: Option, + pub swap_in: Option, + pub swap_out: Option, + pub major_faults: Option, + pub minor_faults: Option, + pub free_memory: Option, + pub total_memory: Option, + pub available_memory: Option, + pub disk_caches: Option, + pub hugetlb_allocations: Option, + pub hugetlb_failures: Option, + pub oom_kill: Option, + pub alloc_stall: Option, + pub async_scan: Option, + pub direct_scan: Option, + pub async_reclaim: Option, + pub direct_reclaim: Option, } impl BalloonStatsState { @@ -101,11 +101,11 @@ impl BalloonStatsState { /// at snapshot. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct BalloonState { - stats_polling_interval_s: u16, - stats_desc_index: Option, - latest_stats: BalloonStatsState, - config_space: BalloonConfigSpaceState, - hinting_state: HintingState, + pub stats_polling_interval_s: u16, + pub stats_desc_index: Option, + pub latest_stats: BalloonStatsState, + pub config_space: BalloonConfigSpaceState, + pub hinting_state: HintingState, pub virtio_state: VirtioDeviceState, } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/persist.rs b/src/vmm/src/devices/virtio/block/vhost_user/persist.rs index d507fa9577b..230e6caf47b 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/persist.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/persist.rs @@ -15,14 +15,14 @@ use crate::snapshot::Persist; /// vhost-user block device state. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VhostUserBlockState { - id: String, - partuuid: Option, - cache_type: CacheType, - root_device: bool, - socket_path: String, - vu_acked_protocol_features: u64, - config_space: Vec, - virtio_state: VirtioDeviceState, + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub socket_path: String, + pub vu_acked_protocol_features: u64, + pub config_space: Vec, + pub virtio_state: VirtioDeviceState, } impl Persist<'_> for VhostUserBlock { diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 380fe1de0e8..98f17c258ad 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -53,14 +53,14 @@ impl From for FileEngineType { /// Holds info about the block device. Gets saved in snapshot. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct VirtioBlockState { - id: String, - partuuid: Option, - cache_type: CacheType, - root_device: bool, - disk_path: String, + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub disk_path: String, pub virtio_state: VirtioDeviceState, - rate_limiter_state: RateLimiterState, - file_engine_type: FileEngineTypeState, + pub rate_limiter_state: RateLimiterState, + pub file_engine_type: FileEngineTypeState, } impl Persist<'_> for VirtioBlock { diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index aed1912a69e..1af7a2cc081 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -46,13 +46,13 @@ pub struct RxBufferState { pub struct NetState { pub id: String, pub tap_if_name: String, - rx_rate_limiter_state: RateLimiterState, - tx_rate_limiter_state: RateLimiterState, + pub rx_rate_limiter_state: RateLimiterState, + pub tx_rate_limiter_state: RateLimiterState, /// The associated MMDS network stack. pub mmds_ns: Option, - config_space: NetConfigSpaceState, + pub config_space: NetConfigSpaceState, pub virtio_state: VirtioDeviceState, - rx_buffers_state: RxBufferState, + pub rx_buffers_state: RxBufferState, } /// Auxiliary structure for creating a device when resuming from a snapshot. diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 85c4940f305..4306b60961b 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -194,13 +194,13 @@ impl VirtioDeviceState { #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct MmioTransportState { // The register where feature bits are stored. - features_select: u32, + pub features_select: u32, // The register where features page is selected. 
- acked_features_select: u32, - queue_select: u32, - device_status: u32, - config_generation: u32, - interrupt_status: u32, + pub acked_features_select: u32, + pub queue_select: u32, + pub device_status: u32, + pub config_generation: u32, + pub interrupt_status: u32, } /// Auxiliary structure for initializing the transport when resuming from a snapshot. diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 27df145eb81..e841af4926b 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -20,7 +20,7 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EntropyState { pub virtio_state: VirtioDeviceState, - rate_limiter_state: RateLimiterState, + pub rate_limiter_state: RateLimiterState, } #[derive(Debug)] diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist/mod.rs similarity index 99% rename from src/vmm/src/persist.rs rename to src/vmm/src/persist/mod.rs index ae6b837cf09..337d1389f65 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist/mod.rs @@ -43,6 +43,10 @@ use crate::vstate::vcpu::{VcpuSendEventError, VcpuState}; use crate::vstate::vm::{VmError, VmState}; use crate::{EventManager, Vmm, vstate}; +pub(crate) mod v1_10; +pub(crate) mod v1_12; +pub(crate) mod v1_14; + /// Holds information related to the VM that is not part of VmState. #[derive(Clone, Debug, Default, Deserialize, PartialEq, Eq, Serialize)] pub struct VmInfo { diff --git a/src/vmm/src/persist/v1_10/aarch64.rs b/src/vmm/src/persist/v1_10/aarch64.rs new file mode 100644 index 00000000000..ff7ab011a78 --- /dev/null +++ b/src/vmm/src/persist/v1_10/aarch64.rs @@ -0,0 +1,45 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use serde::{Deserialize, Serialize}; + +use super::{KvmCapability, MMIODeviceInfo}; + +// Types that are identical across all versions — canonical definitions in v1_14. +pub use crate::v1_14::{ + StaticCpuTemplate, + DeviceType, + GicRegState, + VgicSysRegsState, + GicVcpuState, + Aarch64RegisterVec, +}; + +// Types that are identical in v1.10 and v1.12 — canonical definitions in v1_12. +pub use crate::v1_12::{ + // aarch64 GicState is identical in v1.10 and v1.12 (gains its_state in v1.14) + GicState, + // aarch64 VcpuState is identical in v1.10 and v1.12 (gains pvtime_ipa in v1.14) + VcpuState, +}; + +// ─────────────────────────────────────────────────────────────────── +// aarch64 legacy device info (v1.10 layout: uses v1.10 MMIODeviceInfo with irqs: Vec) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedLegacyState { + pub type_: DeviceType, + pub device_info: MMIODeviceInfo, +} + +// ─────────────────────────────────────────────────────────────────── +// VM state (aarch64, v1.10) +// In v1.10, VmState holds kvm_cap_modifiers; memory_state is at MicrovmState level. +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub gic: GicState, + pub kvm_cap_modifiers: Vec, +} diff --git a/src/vmm/src/persist/v1_10/mod.rs b/src/vmm/src/persist/v1_10/mod.rs new file mode 100644 index 00000000000..f95ce37bdca --- /dev/null +++ b/src/vmm/src/persist/v1_10/mod.rs @@ -0,0 +1,154 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serializable state types for Firecracker v1.10 (snapshot format version 4.0.0). +//! +//! Types that are identical to v1.14 are imported from that module (the canonical source). +//! 
Types that are the same in v1.10 and v1.12 (but different from v1.14) are imported +//! from v1.12 (the canonical source for that version pair). +//! Only types that are truly v1.10-specific are defined here. +//! +//! Key differences from v1.12: +//! - `GuestMemoryRegionState` includes an `offset` field (removed in v1.11) +//! - `MMIODeviceInfo` uses `irqs: Vec` (changed to `irq: Option` in v1.11) +//! - `VmState` (both arches) has `kvm_cap_modifiers` instead of `memory` +//! - `MicrovmState` has `memory_state: GuestMemoryState` at the top level (not inside VmState) +//! - x86_64 `VcpuState.xsave` is `kvm_xsave` (changed to `Xsave` in v1.12) +//! - No `KvmState` wrapper struct + +use serde::{Deserialize, Serialize}; + +#[cfg(target_arch = "x86_64")] +pub(crate) mod x86_64; +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +pub(crate) mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +// ─────────────────────────────────────────────────────────────────── +// Types identical to v1.12 — imported from that module (canonical source) +// ─────────────────────────────────────────────────────────────────── + +use crate::persist::VmInfo; + +pub use super::v1_12::{ + // ACPI device manager state (used in MicrovmState defined below) + ACPIDeviceManagerState, + BalloonState, + // Device inner states (used in Connected* wrappers defined below) + BlockState, + EntropyState, + // MMDS version (used in DeviceStates defined below) + MmdsVersionState, + // Virtio transport state (used in Connected* wrappers defined below) + MmioTransportState, + NetState, + VsockState, +}; + +// ─────────────────────────────────────────────────────────────────── +// MMIO device info (v1.10 uses `irqs: Vec`, changed to `irq: Option` in v1.11) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MMIODeviceInfo { + pub addr: u64, + pub len: u64, + pub 
irqs: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// Connected device state wrappers (use v1.10 MMIODeviceInfo with irqs: Vec) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBlockState { + pub device_id: String, + pub device_state: BlockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedNetState { + pub device_id: String, + pub device_state: NetState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedVsockState { + pub device_id: String, + pub device_state: VsockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBalloonState { + pub device_id: String, + pub device_state: BalloonState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedEntropyState { + pub device_id: String, + pub device_state: EntropyState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +// ─────────────────────────────────────────────────────────────────── +// Device states (v1.10 layout) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct DeviceStates { + #[cfg(target_arch = "aarch64")] + pub legacy_devices: Vec, + pub block_devices: Vec, + pub net_devices: Vec, + pub vsock_device: Option, + pub balloon_device: Option, + pub mmds_version: Option, + pub entropy_device: Option, +} + +// ─────────────────────────────────────────────────────────────────── +// Memory state (v1.10: GuestMemoryRegionState has `offset` field) +// 
─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryRegionState { + pub base_address: u64, + pub size: usize, + /// File offset into the memory snapshot file (present in v1.10, removed in v1.11) + pub offset: u64, +} + +/// The guest memory regions recorded in a v1.10 snapshot. +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryState { + pub regions: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// Top-level MicrovmState (v1.10) +// Note: `memory_state` is at this level (not inside VmState), and there is no `kvm_state`. +// The kvm_cap_modifiers field lives inside VmState. +// ─────────────────────────────────────────────────────────────────── + +/// Top-level state of a v1.10 snapshot. Field order is part of the serialized layout. +#[derive(Debug, Serialize, Deserialize)] +pub struct MicrovmState { + pub vm_info: VmInfo, + pub memory_state: GuestMemoryState, + pub vm_state: VmState, + pub vcpu_states: Vec, + pub device_states: DeviceStates, + pub acpi_dev_state: ACPIDeviceManagerState, +} diff --git a/src/vmm/src/persist/v1_10/x86_64.rs b/src/vmm/src/persist/v1_10/x86_64.rs new file mode 100644 index 00000000000..d66d1c36eec --- /dev/null +++ b/src/vmm/src/persist/v1_10/x86_64.rs @@ -0,0 +1,55 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{ + CpuId, Msrs, kvm_clock_data, kvm_debugregs, kvm_irqchip, kvm_lapic_state, kvm_mp_state, + kvm_pit_state2, kvm_regs, kvm_sregs, kvm_vcpu_events, kvm_xcrs, kvm_xsave, +}; +use serde::{Deserialize, Serialize}; + +use crate::cpu_config::templates::KvmCapability; + +// ─────────────────────────────────────────────────────────────────── +// VM state (x86_64, v1.10) +// In v1.10, VmState holds kvm_cap_modifiers; memory_state is at MicrovmState level. 
+// ─────────────────────────────────────────────────────────────────── + +/// x86_64 VM state in the v1.10 layout, including `kvm_cap_modifiers`. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub pitstate: kvm_pit_state2, + pub clock: kvm_clock_data, + pub pic_master: kvm_irqchip, + pub pic_slave: kvm_irqchip, + pub ioapic: kvm_irqchip, + pub kvm_cap_modifiers: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// vCPU state (x86_64, v1.10) +// xsave is kvm_xsave (not Xsave/FamStructWrapper) +// ─────────────────────────────────────────────────────────────────── + +/// x86_64 vCPU state in the v1.10 layout. `Debug` is implemented by hand rather than derived. +#[derive(Serialize, Deserialize)] +pub struct VcpuState { + pub cpuid: CpuId, + pub saved_msrs: Vec, + pub debug_regs: kvm_debugregs, + pub lapic: kvm_lapic_state, + pub mp_state: kvm_mp_state, + pub regs: kvm_regs, + pub sregs: kvm_sregs, + pub vcpu_events: kvm_vcpu_events, + pub xcrs: kvm_xcrs, + /// In v1.10, xsave is stored as kvm_xsave (4096-byte opaque blob). + /// In v1.12+, it became Xsave = FamStructWrapper to support Intel AMX. + pub xsave: kvm_xsave, + pub tsc_khz: Option, +} + +/// Hand-written `Debug`: prints only `tsc_khz` and marks the remaining fields as elided. +impl std::fmt::Debug for VcpuState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VcpuState") + .field("tsc_khz", &self.tsc_khz) + .finish_non_exhaustive() + } +} diff --git a/src/vmm/src/persist/v1_12/aarch64.rs b/src/vmm/src/persist/v1_12/aarch64.rs new file mode 100644 index 00000000000..7c66c2f1dae --- /dev/null +++ b/src/vmm/src/persist/v1_12/aarch64.rs @@ -0,0 +1,64 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{kvm_mp_state, kvm_vcpu_init}; +use serde::{Deserialize, Serialize}; + +use super::{GuestMemoryState, MMIODeviceInfo}; + +// Types that are canonical in v1_14 and unchanged through all versions +// NOTE(review): the x86_64 sibling of this module reaches v1_14 via `crate::persist::v1_14`; +// confirm that the shorter `crate::v1_14` path below resolves in this crate. +pub use crate::v1_14::{ + // Legacy device type enum + DeviceType, + // GIC helper types (GicState itself changed — its_state added — so redefined in v1_14) + GicRegState, + VgicSysRegsState, + GicVcpuState, + // Register vector with custom serde + Aarch64RegisterVec, +}; + +// ─────────────────────────────────────────────────────────────────── +// aarch64 GIC types (identical to v1.10; its_state added in v1.14) +// Canonical definitions are here; v1.10 imports from this module. +// ─────────────────────────────────────────────────────────────────── + +/// aarch64 GIC state in the v1.10/v1.12 layout (no `its_state` field). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GicState { + pub dist: Vec>, + pub gic_vcpu_states: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// vCPU state (aarch64, v1.10 = v1.12) +// Canonical definition is here; v1.10 imports from this module. +// Gains `pvtime_ipa` in v1.14. 
+// ─────────────────────────────────────────────────────────────────── + +/// aarch64 vCPU state in the v1.10/v1.12 layout (no `pvtime_ipa` field). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VcpuState { + pub mp_state: kvm_mp_state, + pub regs: Aarch64RegisterVec, + pub mpidr: u64, + pub kvi: kvm_vcpu_init, +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.12: memory moved into VmState; kvm_cap_modifiers → KvmState +// ─────────────────────────────────────────────────────────────────── + +/// aarch64 VM state in the v1.12 layout: guest memory followed by GIC state. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub memory: GuestMemoryState, + pub gic: GicState, +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 ConnectedLegacyState uses updated MMIODeviceInfo +// ─────────────────────────────────────────────────────────────────── + +/// aarch64 legacy device entry: the device type and its MMIO device info. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedLegacyState { + pub type_: DeviceType, + pub device_info: MMIODeviceInfo, +} diff --git a/src/vmm/src/persist/v1_12/mod.rs b/src/vmm/src/persist/v1_12/mod.rs new file mode 100644 index 00000000000..85ba0d00b31 --- /dev/null +++ b/src/vmm/src/persist/v1_12/mod.rs @@ -0,0 +1,435 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serializable state types for Firecracker v1.12 (snapshot format version 6.0.0). +//! +//! Types that are structurally identical to v1.14 are imported from that module. +//! Types that are the same in v1.10 and v1.12 (but different from v1.14) are defined +//! here as the canonical source; v1.10 imports them from this module. +//! Only types that are truly v1.12-specific are also defined here. +//! +//! Changes from v1.10: +//! - `MMIODeviceInfo`: `irqs: Vec<u32>` → `irq: Option<u32>` (v1.11) +//! - `GuestMemoryRegionState`: `offset` field removed (v1.11) +//! - `VmState`: memory moved here from `MicrovmState`, `kvm_cap_modifiers` moved to `KvmState` +//! 
- x86_64 `VcpuState.xsave`: `kvm_xsave` → `Xsave` (v1.12) +//! - `KvmState`: new wrapper for `kvm_cap_modifiers` +//! - `MicrovmState`: adds `kvm_state`, removes `memory_state` + +use serde::{Deserialize, Serialize}; + +use super::v1_10; +use crate::arch::VcpuState; +use crate::devices::acpi::vmgenid::VMGenIDState; +use crate::devices::virtio::balloon::persist::BalloonConfigSpaceState; +use crate::devices::virtio::block::CacheType; +use crate::devices::virtio::block::virtio::persist::FileEngineTypeState; +use crate::devices::virtio::net::persist::{NetConfigSpaceState, RxBufferState}; +use crate::devices::virtio::persist::QueueState; +use crate::devices::virtio::vsock::persist::VsockBackendState; +use crate::mmds::persist::MmdsNetworkStackState; +use crate::persist::VmInfo; +use crate::rate_limiter::persist::RateLimiterState; +use crate::vstate::kvm::KvmState; + +#[cfg(target_arch = "x86_64")] +pub(crate) mod x86_64; +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +pub(crate) mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +// ─────────────────────────────────────────────────────────────────── +// Shared simple types — same in v1.10 and v1.12; differs in v1.14 +// Canonical definitions are here; v1.10 imports from this module. 
+// ─────────────────────────────────────────────────────────────────── +/// Common virtio device state in the v1.10/v1.12 layout; it still carries `interrupt_status`. +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct VirtioDeviceState { + pub device_type: u32, + pub avail_features: u64, + pub acked_features: u64, + pub queues: Vec, + pub interrupt_status: u32, + pub activated: bool, +} + +/// MMIO transport state in the v1.10/v1.12 layout (no `interrupt_status` field here). +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MmioTransportState { + pub features_select: u32, + pub acked_features_select: u32, + pub queue_select: u32, + pub device_status: u32, + pub config_generation: u32, +} + +// ─────────────────────────────────────────────────────────────────── +// Block device +// ─────────────────────────────────────────────────────────────────── + +/// State of a virtio block device. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioBlockState { + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub disk_path: String, + pub virtio_state: VirtioDeviceState, + pub rate_limiter_state: RateLimiterState, + pub file_engine_type: FileEngineTypeState, +} + +/// State of a vhost-user block device. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VhostUserBlockState { + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub socket_path: String, + pub vu_acked_protocol_features: u64, + pub config_space: Vec, + pub virtio_state: VirtioDeviceState, +} + +/// Block device state: either the virtio or the vhost-user variant. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum BlockState { + Virtio(VirtioBlockState), + VhostUser(VhostUserBlockState), +} + +// ─────────────────────────────────────────────────────────────────── +// Net device +// ─────────────────────────────────────────────────────────────────── + +/// State of a network device in the v1.10/v1.12 layout. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NetState { + pub id: String, + pub tap_if_name: String, + pub rx_rate_limiter_state: RateLimiterState, + pub tx_rate_limiter_state: RateLimiterState, + pub mmds_ns: Option, + pub config_space: NetConfigSpaceState, + pub virtio_state: 
VirtioDeviceState, + pub rx_buffers_state: RxBufferState, +} + +// ─────────────────────────────────────────────────────────────────── +// Vsock device +// ─────────────────────────────────────────────────────────────────── + +/// Frontend half of the vsock device state: the CID and the common virtio state. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VsockFrontendState { + pub cid: u64, + pub virtio_state: VirtioDeviceState, +} + +/// Vsock device state: backend state followed by frontend state. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VsockState { + pub backend: VsockBackendState, + pub frontend: VsockFrontendState, +} + +// ─────────────────────────────────────────────────────────────────── +// Balloon device +// ─────────────────────────────────────────────────────────────────── + +/// Balloon statistics in the v1.10/v1.12 layout; every statistic is optional. +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct BalloonStatsState { + pub swap_in: Option, + pub swap_out: Option, + pub major_faults: Option, + pub minor_faults: Option, + pub free_memory: Option, + pub total_memory: Option, + pub available_memory: Option, + pub disk_caches: Option, + pub hugetlb_allocations: Option, + pub hugetlb_failures: Option, +} + +/// Balloon device state in the v1.10/v1.12 layout. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BalloonState { + pub stats_polling_interval_s: u16, + pub stats_desc_index: Option, + pub latest_stats: BalloonStatsState, + pub config_space: BalloonConfigSpaceState, + pub virtio_state: VirtioDeviceState, +} + +// ─────────────────────────────────────────────────────────────────── +// Entropy device +// ─────────────────────────────────────────────────────────────────── + +/// Entropy device state: the common virtio state and the rate limiter state. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EntropyState { + pub virtio_state: VirtioDeviceState, + pub rate_limiter_state: RateLimiterState, +} + +// ─────────────────────────────────────────────────────────────────── +// MMDS +// ─────────────────────────────────────────────────────────────────── + +/// MMDS version (renamed to `MmdsVersion` and restructured in v1.14). 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum MmdsVersionState { + V1, + V2, +} + +// ─────────────────────────────────────────────────────────────────── +// ACPI devices state (same as v1.10; vmgenid becomes mandatory in v1.14) +// ─────────────────────────────────────────────────────────────────── + +/// ACPI device state in the v1.10/v1.12 layout; `vmgenid` is optional here. +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct ACPIDeviceManagerState { + pub vmgenid: Option, +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.11: irqs: Vec<u32> → irq: Option<u32> +// ─────────────────────────────────────────────────────────────────── + +/// MMIO device info. +/// +/// Note: stored as `Option<NonZeroU32>` in Firecracker source, but `NonZeroU32` has +/// the same bincode wire format as `u32`, so we use `Option<u32>` here. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MMIODeviceInfo { + pub addr: u64, + pub len: u64, + pub irq: Option, +} + +impl MMIODeviceInfo { + /// Converts a v1.10 `MMIODeviceInfo`: `addr` and `len` are copied, and only the first + /// entry of `irqs` (if any) is kept as `irq`; any further entries are dropped. + pub(crate) fn from(old: v1_10::MMIODeviceInfo) -> MMIODeviceInfo { + MMIODeviceInfo { + addr: old.addr, + len: old.len, + // v1.10 stored a Vec of IRQs; v1.11+ uses a single optional IRQ. + // In practice exactly one IRQ was always present for devices that have one. + irq: old.irqs.into_iter().next(), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.11: `offset` field removed from GuestMemoryRegionState +// ─────────────────────────────────────────────────────────────────── + +/// Guest memory region in the v1.12 layout: base address and size only. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryRegionState { + pub base_address: u64, + pub size: usize, +} + +/// Converts a v1.10 region by copying `base_address` and `size`. +impl From for GuestMemoryRegionState { + fn from(old: v1_10::GuestMemoryRegionState) -> Self { + // Drop the `offset` field which was removed in v1.11. 
+ GuestMemoryRegionState { + base_address: old.base_address, + size: old.size, + } + } +} + +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryState { + pub regions: Vec, +} + +impl From for GuestMemoryState { + fn from(old: v1_10::GuestMemoryState) -> Self { + GuestMemoryState { + regions: old + .regions + .into_iter() + .map(GuestMemoryRegionState::from) + .collect(), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Connected device state wrappers — redefined because MMIODeviceInfo changed. +// ─────────────────────────────────────────────────────────────────── + +/// Block device entry of a v1.12 snapshot: device id and state plus MMIO transport state and device info. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBlockState { + pub device_id: String, + pub device_state: BlockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +/// Net device entry of a v1.12 snapshot: device id and state plus MMIO transport state and device info. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedNetState { + pub device_id: String, + pub device_state: NetState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +/// Vsock device entry of a v1.12 snapshot: device id and state plus MMIO transport state and device info. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedVsockState { + pub device_id: String, + pub device_state: VsockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +/// Balloon device entry of a v1.12 snapshot: device id and state plus MMIO transport state and device info. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBalloonState { + pub device_id: String, + pub device_state: BalloonState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +/// Entropy device entry of a v1.12 snapshot: device id and state plus MMIO transport state and device info. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedEntropyState { + pub device_id: String, + pub device_state: EntropyState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +/// All device entries of a v1.12 snapshot. Field order is part of the serialized layout. +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct DeviceStates { + // `legacy_devices` exists only on aarch64 builds. + #[cfg(target_arch = "aarch64")] + pub legacy_devices: Vec, + pub block_devices: Vec, + pub net_devices: Vec, + pub vsock_device: Option, 
+ pub balloon_device: Option, + pub mmds_version: Option, + pub entropy_device: Option, +} + +impl From for DeviceStates { + fn from(old: v1_10::DeviceStates) -> Self { + DeviceStates { + #[cfg(target_arch = "aarch64")] + legacy_devices: old + .legacy_devices + .into_iter() + .map(|ld| ConnectedLegacyState { + type_: ld.type_, + device_info: MMIODeviceInfo::from(ld.device_info), + }) + .collect(), + block_devices: old + .block_devices + .into_iter() + .map(|d| ConnectedBlockState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }) + .collect(), + net_devices: old + .net_devices + .into_iter() + .map(|d| ConnectedNetState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }) + .collect(), + vsock_device: old.vsock_device.map(|d| ConnectedVsockState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }), + balloon_device: old.balloon_device.map(|d| ConnectedBalloonState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }), + mmds_version: old.mmds_version, + entropy_device: old.entropy_device.map(|d| ConnectedEntropyState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Top-level MicrovmState (v1.12) +// ─────────────────────────────────────────────────────────────────── + +/// Top-level state of a v1.12 snapshot. Field order is part of the serialized layout. +#[derive(Debug, Serialize, Deserialize)] +pub struct MicrovmState { + /// `VmInfo` as imported into this module from `crate::persist`; unchanged through all versions. 
+ pub vm_info: VmInfo, + /// `KvmState` as imported into this module from `crate::vstate::kvm`; wraps + /// `kvm_cap_modifiers`, extracted from v1.10's `VmState`. + pub kvm_state: KvmState, + /// Redefined in v1.12: `memory` moved in from top-level `MicrovmState.memory_state`, + /// `kvm_cap_modifiers` moved out to `KvmState`. Redefined again in v1.14: gains + /// `resource_allocator`; `GuestMemoryRegionState` gains `region_type` and `plugged`. + pub vm_state: VmState, + /// x86_64: redefined here (`xsave` type changed from `kvm_xsave` to `Xsave`); + /// imported into v1.14 (same type). + /// aarch64: canonical definition here (same as v1.10; gains `pvtime_ipa` in v1.14). + pub vcpu_states: Vec, + /// Redefined here: all `ConnectedXxxState` wrappers rebuilt because `MMIODeviceInfo` + /// changed (`irqs: Vec<u32>` → `irq: Option<u32>`). Inner device states (BlockState, + /// NetState, etc.) are defined in this module as the v1.10/v1.12 canonical source. + pub device_states: DeviceStates, + /// Defined in this module as the v1.10/v1.12 canonical source. Redefined in v1.14: + /// `vmgenid` becomes mandatory, x86_64 gains `vmclock`; moved inside + /// `DevicesState.acpi_state` (no longer top-level). + pub acpi_dev_state: ACPIDeviceManagerState, +} + +/// Upgrades a v1.10 `MicrovmState` to the v1.12 layout. +impl From for MicrovmState { + fn from(old: v1_10::MicrovmState) -> Self { + // In v1.10, kvm_cap_modifiers lives in VmState; in v1.12 it moves to KvmState. + // KvmCapability is the same type in all versions + // (the v1.10 x86_64 module takes it from `crate::cpu_config::templates`). + let kvm_cap_modifiers = old.vm_state.kvm_cap_modifiers; + + // The top-level v1.10 `memory_state` becomes `VmState.memory`. + let memory = GuestMemoryState::from(old.memory_state); + + #[cfg(target_arch = "x86_64")] + let vm_state = VmState { + memory, + pitstate: old.vm_state.pitstate, + clock: old.vm_state.clock, + pic_master: old.vm_state.pic_master, + pic_slave: old.vm_state.pic_slave, + ioapic: old.vm_state.ioapic, + }; + + #[cfg(target_arch = "aarch64")] + let vm_state = VmState { + memory, + gic: old.vm_state.gic, + }; + + // x86_64: xsave type changed from kvm_xsave → Xsave, needs conversion. 
+ // aarch64: VcpuState is identical in v1.10 and v1.12 (v1_12 is canonical source). + #[cfg(target_arch = "x86_64")] + let vcpu_states: Vec = + old.vcpu_states.into_iter().map(VcpuState::from).collect(); + #[cfg(target_arch = "aarch64")] + let vcpu_states = old.vcpu_states; + + // `vm_info` and `acpi_dev_state` are moved over unchanged. + MicrovmState { + vm_info: old.vm_info, + kvm_state: KvmState { kvm_cap_modifiers }, + vm_state, + vcpu_states, + device_states: DeviceStates::from(old.device_states), + acpi_dev_state: old.acpi_dev_state, + } + } +} diff --git a/src/vmm/src/persist/v1_12/x86_64.rs b/src/vmm/src/persist/v1_12/x86_64.rs new file mode 100644 index 00000000000..912bb10b7ab --- /dev/null +++ b/src/vmm/src/persist/v1_12/x86_64.rs @@ -0,0 +1,46 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{kvm_clock_data, kvm_irqchip, kvm_pit_state2}; +use serde::{Deserialize, Serialize}; + +use crate::{arch::VcpuState, persist::v1_14::x86_64::xsave_from_v1_10}; + +use super::{GuestMemoryState, v1_10}; + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.12: memory moved into VmState; kvm_cap_modifiers → KvmState +// ─────────────────────────────────────────────────────────────────── + +/// x86_64 VM state in the v1.12 layout: guest memory first, then the interrupt/timer chip state. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub memory: GuestMemoryState, + pub pitstate: kvm_pit_state2, + pub clock: kvm_clock_data, + pub pic_master: kvm_irqchip, + pub pic_slave: kvm_irqchip, + pub ioapic: kvm_irqchip, +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.12: xsave type changed from kvm_xsave → Xsave +// VcpuState is defined in v1_14 (same in v1.12 and v1.14); conversion from v1.10 is here. 
+// ─────────────────────────────────────────────────────────────────── + +impl VcpuState { + /// Converts a v1.10 vCPU state: every field is moved over unchanged except `xsave`, + /// which goes through `xsave_from_v1_10`. + pub(crate) fn from(old: v1_10::VcpuState) -> VcpuState { + VcpuState { + cpuid: old.cpuid, + saved_msrs: old.saved_msrs, + debug_regs: old.debug_regs, + lapic: old.lapic, + mp_state: old.mp_state, + regs: old.regs, + sregs: old.sregs, + vcpu_events: old.vcpu_events, + xcrs: old.xcrs, + xsave: xsave_from_v1_10(old.xsave), + tsc_khz: old.tsc_khz, + } + } +} diff --git a/src/vmm/src/persist/v1_14/aarch64.rs b/src/vmm/src/persist/v1_14/aarch64.rs new file mode 100644 index 00000000000..f203ea6fee6 --- /dev/null +++ b/src/vmm/src/persist/v1_14/aarch64.rs @@ -0,0 +1,201 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{kvm_mp_state, kvm_vcpu_init}; +use serde::{Deserialize, Serialize}; + +// NOTE(review): other modules in this patch use `crate::persist::...` paths for the versioned +// modules; confirm that `crate::convert` and `crate::v1_12` resolve in this crate. +use crate::convert::{ConvertError, irq_to_gsi}; +use crate::v1_12; + +use super::{ + ACPIDeviceManagerState, GuestMemoryState, MMIODeviceInfo, ResourceAllocator, VMGenIDState, +}; + +// ─────────────────────────────────────────────────────────────────── +// StaticCpuTemplate — canonical definition (identical in v1.10, v1.12, v1.14) +// ─────────────────────────────────────────────────────────────────── + +/// Static CPU template selection; the default variant is `None`. +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum StaticCpuTemplate { + V1N1, + #[default] + None, +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 legacy device types — canonical definitions +// ─────────────────────────────────────────────────────────────────── + +/// Legacy device kind: `Virtio` (with a `u32` payload), `Serial` or `Rtc`. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum DeviceType { + Virtio(u32), + Serial, + Rtc, +} + +// ─────────────────────────────────────────────────────────────────── +// GIC helper types — canonical definitions (unchanged since v1.10) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] 
+#[serde(bound(serialize = "T: Serialize", deserialize = "T: for<'a> Deserialize<'a>"))] +pub struct GicRegState Deserialize<'a>> { + pub chunks: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VgicSysRegsState { + pub main_icc_regs: Vec>, + pub ap_icc_regs: Vec>>, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GicVcpuState { + pub rdist: Vec>, + pub icc: VgicSysRegsState, +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 register vector — canonical definition (unchanged since v1.10) +// ─────────────────────────────────────────────────────────────────── + +/// aarch64 register vector with custom serde: serialized as (Vec, Vec) +#[derive(Debug, Clone)] +pub struct Aarch64RegisterVec { + pub ids: Vec, + pub data: Vec, +} + +/// Serializes the register vector as the tuple `(ids, data)`. +impl Serialize for Aarch64RegisterVec { + fn serialize(&self, serializer: S) -> Result { + (&self.ids, &self.data).serialize(serializer) + } +} + +/// Deserializes the `(ids, data)` tuple written by the `Serialize` impl of this type. +impl<'de> Deserialize<'de> for Aarch64RegisterVec { + fn deserialize>(deserializer: D) -> Result { + let (ids, data) = <(Vec, Vec)>::deserialize(deserializer)?; + Ok(Aarch64RegisterVec { ids, data }) + } +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 ConnectedLegacyState (uses updated MMIODeviceInfo with gsi) +// ─────────────────────────────────────────────────────────────────── + +/// aarch64 legacy device entry in the v1.14 layout: the device type and its MMIO device info. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedLegacyState { + pub type_: DeviceType, + pub device_info: MMIODeviceInfo, +} + +/// Converts a v1.12 legacy device entry: `type_` is kept, `device_info` goes through `MMIODeviceInfo::from`. +impl From for ConnectedLegacyState { + fn from(s: v1_12::ConnectedLegacyState) -> Self { + ConnectedLegacyState { + type_: s.type_, + device_info: MMIODeviceInfo::from(s.device_info), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 GIC state (v1.14: adds its_state) +// GicRegState, VgicSysRegsState, GicVcpuState are defined above +// ─────────────────────────────────────────────────────────────────── + 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ItsRegisterState { + pub iidr: u64, + pub cbaser: u64, + pub creadr: u64, + pub cwriter: u64, + pub baser: [u64; 8], + pub ctlr: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GicState { + pub dist: Vec>, + pub gic_vcpu_states: Vec, + /// ITS state (GICv3 only). None for GICv2 or when converted from v1.12. + pub its_state: Option, +} + +impl GicState { + /// Converts a v1.12 GIC state: `dist` and `gic_vcpu_states` are moved over and + /// `its_state` is set to `None`. + pub(crate) fn from(old_state: v1_12::GicState) -> GicState { + GicState { + dist: old_state.dist, + gic_vcpu_states: old_state.gic_vcpu_states, + its_state: None, // v1.12 had no ITS support + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// vCPU state (aarch64, v1.14: gains pvtime_ipa) +// ─────────────────────────────────────────────────────────────────── + +/// aarch64 vCPU state in the v1.14 layout, with the additional `pvtime_ipa` field. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VcpuState { + pub mp_state: kvm_mp_state, + pub regs: Aarch64RegisterVec, + pub mpidr: u64, + pub kvi: kvm_vcpu_init, + pub pvtime_ipa: Option, +} + +impl VcpuState { + /// Converts a v1.12 vCPU state: existing fields are moved over and `pvtime_ipa` is `None`. + pub(crate) fn from(old_state: v1_12::VcpuState) -> VcpuState { + VcpuState { + mp_state: old_state.mp_state, + regs: old_state.regs, + mpidr: old_state.mpidr, + kvi: old_state.kvi, + pvtime_ipa: None, // new in v1.14; default to None (not configured) + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// ACPI device state impl (aarch64: no vmclock) +// ─────────────────────────────────────────────────────────────────── + +impl ACPIDeviceManagerState { + /// Converts the v1.12 ACPI state. + /// + /// Fails with `ConvertError::MissingVmGenId` when the old state has no `vmgenid`. + /// `_resource_allocator` is not used by this aarch64 implementation. + pub(crate) fn from( + s: v1_12::ACPIDeviceManagerState, + _resource_allocator: &mut ResourceAllocator, + ) -> Result { + let vmgenid = s.vmgenid.ok_or(ConvertError::MissingVmGenId)?; + Ok(ACPIDeviceManagerState { + vmgenid: VMGenIDState { + // v1.12 aarch64 uses IRQ_BASE=32-based numbers; v1.14 uses 0-based GSIs + gsi: irq_to_gsi(vmgenid.gsi), + addr: vmgenid.addr, + }, + }) + } +} + +// 
─────────────────────────────────────────────────────────────────── +// VM state (aarch64, v1.14: adds resource_allocator) +// ─────────────────────────────────────────────────────────────────── + +/// aarch64 VM state in the v1.14 layout: guest memory, GIC state and the resource allocator. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub memory: GuestMemoryState, + pub gic: GicState, + pub resource_allocator: ResourceAllocator, +} + +impl VmState { + /// Converts a v1.12 VM state: `memory` and `gic` go through their own `from` conversions, + /// and the caller-supplied `resource_allocator` is stored as-is. + pub(crate) fn from( + old_state: v1_12::VmState, + resource_allocator: ResourceAllocator, + ) -> VmState { + VmState { + memory: GuestMemoryState::from(old_state.memory), + gic: GicState::from(old_state.gic), + resource_allocator, + } + } +} diff --git a/src/vmm/src/persist/v1_14/mod.rs b/src/vmm/src/persist/v1_14/mod.rs new file mode 100644 index 00000000000..dcaddb6e8c4 --- /dev/null +++ b/src/vmm/src/persist/v1_14/mod.rs @@ -0,0 +1,593 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serializable state types for Firecracker v1.14 (snapshot format version 8.0.0). +//! +//! This module is the **canonical source** for types shared across all snapshot versions. +//! Older modules (v1_12, v1_10) import unchanged types from here rather than defining +//! their own copies. +//! +//! Types that are unique to v1.14 or changed from v1.12: +//! - `VirtioDeviceState`: `interrupt_status` removed (moved to `MmioTransportState`) +//! - `MmioTransportState`: gains `interrupt_status` +//! - `MMIODeviceInfo`: `irq` → `gsi` +//! - `NetState`: `rx_buffers_state` retained +//! - `BalloonStatsState`: 6 new fields +//! - `BalloonState`: gains `hinting_state` +//! - aarch64 `GicState`: gains `its_state` +//! - aarch64 `VcpuState`: gains `pvtime_ipa` +//! - `GuestMemoryRegionState`: gains `region_type` and `plugged` +//! - `ACPIDeviceManagerState`: vmgenid now mandatory, adds vmclock (x86_64) +//! - New types: `ConnectedDeviceState`, `DevicesState`, `ResourceAllocator`, +//! 
`PmemState`, `VirtioMemState`, `MmdsState`, `GuestRegionType`, etc. + +use vm_allocator::{AddressAllocator, AllocPolicy, IdAllocator}; + +#[cfg(target_arch = "x86_64")] +pub(crate) mod x86_64; + +#[cfg(target_arch = "aarch64")] +pub(crate) mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +use crate::arch::{ + FIRST_ADDR_PAST_64BITS_MMIO, GSI_LEGACY_END, GSI_LEGACY_START, GSI_MSI_END, GSI_MSI_START, + MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, MEM_64BIT_DEVICES_SIZE, + MEM_64BIT_DEVICES_START, PAST_64BITS_MMIO_SIZE, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, VmState, +}; +use crate::device_manager::DevicesState; +use crate::device_manager::mmio::MMIODeviceInfo; +use crate::device_manager::pci_mngr::PciDevicesState; +use crate::device_manager::persist::{ + ACPIDeviceManagerState, DeviceStates, MmdsState, VirtioDeviceState as ConnectedDeviceState, +}; +use crate::devices::acpi::vmgenid::VMGENID_MEM_SIZE; +use crate::devices::virtio::balloon::device::HintingState; +use crate::devices::virtio::balloon::persist::{BalloonState, BalloonStatsState}; +use crate::devices::virtio::block::persist::BlockState; +use crate::devices::virtio::block::vhost_user::persist::VhostUserBlockState; +use crate::devices::virtio::block::virtio::persist::VirtioBlockState; +use crate::devices::virtio::net::persist::NetState; +use crate::devices::virtio::persist::{MmioTransportState, VirtioDeviceState}; +use crate::devices::virtio::rng::persist::EntropyState; +use crate::devices::virtio::vsock::persist::{VsockFrontendState, VsockState}; +use crate::mmds::data_store::MmdsVersion; +use crate::persist::{MicrovmState, v1_12}; +use crate::vstate::memory::{GuestMemoryRegionState, GuestMemoryState, GuestRegionType}; +use crate::vstate::resources::ResourceAllocator; + +#[derive(Debug, thiserror::Error)] +pub enum ConvertError { + #[error("VMGenID state is missing; cannot convert snapshot (v1.12 snapshot must have VMGenID)")] + MissingVmGenId, + #[error("vm-allocator error during 
ResourceAllocator reconstruction: {0}")] + Allocator(#[from] vm_allocator::Error), + #[error("ResourceAllocator reconstruction failed: duplicate/invalid MMIO address 0x{0:x}")] + DuplicateAddress(u64), + #[error("ResourceAllocator reconstruction failed: GSI {0} out of expected range")] + #[allow(dead_code)] + GsiOutOfRange(u32), +} + +// In v1.12 x86_64, IRQ_BASE = 5 = GSI_LEGACY_START. No conversion needed. +// This constant exists for symmetry with the aarch64 SPI_START offset. +pub const SPI_START: u32 = 0; // no-op offset for x86_64 + +/// Convert a v1.12 IRQ number to a v1.14 GSI number. +/// +/// x86_64: IRQ_BASE (5) == GSI_LEGACY_START (5) — no transformation needed. +/// aarch64: IRQ_BASE (32) != GSI_LEGACY_START (0) — subtract SPI_START (32). +pub(crate) fn irq_to_gsi(irq: u32) -> u32 { + irq.saturating_sub(SPI_START) +} + +impl VirtioDeviceState { + /// Convert v1.12 VirtioDeviceState → v1.14 VirtioDeviceState. + /// + /// With v1.14, the `interrupt_status` moves from [`VirtioDeviceState`] to [`MmioTransportState`]. + /// That's why we don't use `From` here, so we can return + /// `interrupt_status` separately. + pub(crate) fn from(old_state: v1_12::VirtioDeviceState) -> (Self, u32) { + let interrupt_status = old_state.interrupt_status; + let new_state = VirtioDeviceState { + device_type: old_state.device_type, + avail_features: old_state.avail_features, + acked_features: old_state.acked_features, + queues: old_state.queues, // QueueState is the same type (re-exported v1_10 → v1_12 → v1_14) + activated: old_state.activated, + }; + (new_state, interrupt_status) + } +} + +/// Convert v1.12 MmioTransportState → v1.14 MmioTransportState with interrupt_status. 
+impl MmioTransportState { + pub(crate) fn from(old_state: v1_12::MmioTransportState, interrupt_status: u32) -> Self { + MmioTransportState { + features_select: old_state.features_select, + acked_features_select: old_state.acked_features_select, + queue_select: old_state.queue_select, + device_status: old_state.device_status, + config_generation: old_state.config_generation, + interrupt_status, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.14: irq → gsi +// ─────────────────────────────────────────────────────────────────── +impl MMIODeviceInfo { + /// Convert v1.12 MMIODeviceInfo → v1.14 MMIODeviceInfo. + /// irq (Option, same wire format as Option) → gsi: Option + pub(crate) fn from(old_state: v1_12::MMIODeviceInfo) -> MMIODeviceInfo { + MMIODeviceInfo { + addr: old_state.addr, + len: old_state.len, + gsi: old_state.irq.map(irq_to_gsi), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Block device — redefined because VirtioDeviceState changed +// ─────────────────────────────────────────────────────────────────── +impl VirtioBlockState { + pub(crate) fn from(old_state: v1_12::VirtioBlockState) -> (VirtioBlockState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = VirtioBlockState { + id: old_state.id, + partuuid: old_state.partuuid, + cache_type: old_state.cache_type, + root_device: old_state.root_device, + disk_path: old_state.disk_path, + virtio_state, + rate_limiter_state: old_state.rate_limiter_state, + file_engine_type: old_state.file_engine_type, + }; + (new, interrupt_status) + } +} + +impl VhostUserBlockState { + pub(crate) fn from(old_state: v1_12::VhostUserBlockState) -> (VhostUserBlockState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = VhostUserBlockState { + id: old_state.id, + partuuid: old_state.partuuid, + cache_type: 
old_state.cache_type, + root_device: old_state.root_device, + socket_path: old_state.socket_path, + vu_acked_protocol_features: old_state.vu_acked_protocol_features, + config_space: old_state.config_space, + virtio_state, + }; + (new, interrupt_status) + } +} + +impl BlockState { + pub(crate) fn from(old_state: v1_12::BlockState) -> (BlockState, u32) { + match old_state { + v1_12::BlockState::Virtio(b) => { + let (new, irq) = VirtioBlockState::from(b); + (BlockState::Virtio(new), irq) + } + v1_12::BlockState::VhostUser(b) => { + let (new, irq) = VhostUserBlockState::from(b); + (BlockState::VhostUser(new), irq) + } + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// MMDS — MmdsVersionState renamed/restructured to MmdsState +// ─────────────────────────────────────────────────────────────────── +impl MmdsVersion { + pub(crate) fn from(old_state: v1_12::MmdsVersionState) -> MmdsVersion { + match old_state { + v1_12::MmdsVersionState::V1 => MmdsVersion::V1, + v1_12::MmdsVersionState::V2 => MmdsVersion::V2, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Net device — changed: VirtioDeviceState changed; rx_buffers_state retained +// ─────────────────────────────────────────────────────────────────── +impl NetState { + pub(crate) fn from(old_state: v1_12::NetState) -> (NetState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = NetState { + id: old_state.id, + tap_if_name: old_state.tap_if_name, + rx_rate_limiter_state: old_state.rx_rate_limiter_state, + tx_rate_limiter_state: old_state.tx_rate_limiter_state, + mmds_ns: old_state.mmds_ns, + config_space: old_state.config_space, + virtio_state, + rx_buffers_state: old_state.rx_buffers_state, + }; + (new, interrupt_status) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Vsock device — VsockFrontendState/VsockState redefined (VirtioDeviceState changed) 
+// VsockUdsState and VsockBackendState are unchanged and defined above +// ─────────────────────────────────────────────────────────────────── +impl VsockState { + pub(crate) fn from(old_state: v1_12::VsockState) -> (VsockState, u32) { + let (virtio_state, interrupt_status) = + VirtioDeviceState::from(old_state.frontend.virtio_state); + let new = VsockState { + backend: old_state.backend, + frontend: VsockFrontendState { + cid: old_state.frontend.cid, + virtio_state, + }, + }; + (new, interrupt_status) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Balloon device — BalloonStatsState gains 6 new fields; BalloonState gains hinting_state +// ─────────────────────────────────────────────────────────────────── +impl BalloonStatsState { + pub(crate) fn from(old_state: v1_12::BalloonStatsState) -> BalloonStatsState { + BalloonStatsState { + swap_in: old_state.swap_in, + swap_out: old_state.swap_out, + major_faults: old_state.major_faults, + minor_faults: old_state.minor_faults, + free_memory: old_state.free_memory, + total_memory: old_state.total_memory, + available_memory: old_state.available_memory, + disk_caches: old_state.disk_caches, + hugetlb_allocations: old_state.hugetlb_allocations, + hugetlb_failures: old_state.hugetlb_failures, + oom_kill: None, + alloc_stall: None, + async_scan: None, + direct_scan: None, + async_reclaim: None, + direct_reclaim: None, + } + } +} + +impl BalloonState { + pub(crate) fn from(old_state: v1_12::BalloonState) -> (BalloonState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = BalloonState { + stats_polling_interval_s: old_state.stats_polling_interval_s, + stats_desc_index: old_state.stats_desc_index, + latest_stats: BalloonStatsState::from(old_state.latest_stats), + config_space: old_state.config_space, + hinting_state: HintingState { + host_cmd: 0, + last_cmd_id: 0, + guest_cmd: None, + // Default: acknowledge on finish (matches 
firecracker's `default_ack_on_stop()`) + acknowledge_on_finish: true, + }, + virtio_state, + }; + (new, interrupt_status) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Entropy device — redefined because VirtioDeviceState changed +// ─────────────────────────────────────────────────────────────────── +impl EntropyState { + pub(crate) fn from(old_state: v1_12::EntropyState) -> (EntropyState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = EntropyState { + virtio_state, + rate_limiter_state: old_state.rate_limiter_state, + }; + (new, interrupt_status) + } +} + +macro_rules! convert_connected_state { + ($old_type:ty, $new_type:ty) => { + impl From<$old_type> for ConnectedDeviceState<$new_type> { + fn from(old_type: $old_type) -> Self { + let (device_state, interrupt_status) = <$new_type>::from(old_type.device_state); + let transport_state = + MmioTransportState::from(old_type.transport_state, interrupt_status); + ConnectedDeviceState { + device_id: old_type.device_id, + device_state, + transport_state, + device_info: MMIODeviceInfo::from(old_type.device_info), + } + } + } + }; +} + +convert_connected_state!(v1_12::ConnectedBlockState, BlockState); +convert_connected_state!(v1_12::ConnectedNetState, NetState); +convert_connected_state!(v1_12::ConnectedVsockState, VsockState); +convert_connected_state!(v1_12::ConnectedBalloonState, BalloonState); +convert_connected_state!(v1_12::ConnectedEntropyState, EntropyState); + +// ─────────────────────────────────────────────────────────────────── +// Device states (v1.14 layout) +// ─────────────────────────────────────────────────────────────────── + +impl From for DeviceStates { + fn from(old_state: v1_12::DeviceStates) -> Self { + DeviceStates { + #[cfg(target_arch = "aarch64")] + legacy_devices: old_state + .legacy_devices + .into_iter() + .map(ConnectedLegacyState::from) + .collect(), + block_devices: old_state + 
.block_devices + .into_iter() + .map(ConnectedDeviceState::::from) + .collect(), + net_devices: old_state + .net_devices + .into_iter() + .map(ConnectedDeviceState::::from) + .collect(), + vsock_device: old_state + .vsock_device + .map(ConnectedDeviceState::::from), + balloon_device: old_state + .balloon_device + .map(ConnectedDeviceState::::from), + mmds: old_state.mmds_version.map(|v| MmdsState { + version: MmdsVersion::from(v), + imds_compat: false, + }), + entropy_device: old_state + .entropy_device + .map(ConnectedDeviceState::::from), + // pmem and memory devices are new in v1.14, not present in v1.12 + pmem_devices: Vec::new(), + memory_device: None, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Memory state (v1.14: region_type and plugged added) +// ─────────────────────────────────────────────────────────────────── +impl From for GuestMemoryState { + fn from(old_state: v1_12::GuestMemoryState) -> Self { + GuestMemoryState { + regions: old_state + .regions + .into_iter() + .map(|r| GuestMemoryRegionState { + base_address: r.base_address, + size: r.size, + // v1.12 snapshots don't have memory hotplug, all regions are Dram + region_type: GuestRegionType::Dram, + // No slots were plugged/unplugged; Dram regions have a single slot + // of size == region size, so there's 1 plugged slot + plugged: vec![true], + }) + .collect(), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// ResourceAllocator (new in v1.14) +// ─────────────────────────────────────────────────────────────────── +impl ResourceAllocator { + /// Reconstruct the v1.14 ResourceAllocator from v1.12 device information. + /// + /// In v1.12, the ResourceAllocator state wasn't persisted; in v1.14 it is. + /// We reconstruct it by marking all allocations that were made during VM setup. 
+ pub(crate) fn from( + device_states: &v1_12::DeviceStates, + acpi_state: &v1_12::ACPIDeviceManagerState, + ) -> Result { + // Initialize fresh allocators matching ResourceAllocator::new() + let mut gsi_legacy = + IdAllocator::new(GSI_LEGACY_START, GSI_LEGACY_END).map_err(ConvertError::Allocator)?; + let mut gsi_msi = + IdAllocator::new(GSI_MSI_START, GSI_MSI_END).map_err(ConvertError::Allocator)?; + let mut mmio32 = AddressAllocator::new(MEM_32BIT_DEVICES_START, MEM_32BIT_DEVICES_SIZE) + .map_err(ConvertError::Allocator)?; + + // 64-bit MMIO space + let mmio64_start = MEM_64BIT_DEVICES_START; + let mmio64_size = MEM_64BIT_DEVICES_SIZE; + let mmio64 = + AddressAllocator::new(mmio64_start, mmio64_size).map_err(ConvertError::Allocator)?; + + // Past 64-bit MMIO space + let past_mmio64_start = FIRST_ADDR_PAST_64BITS_MMIO; + let past_mmio64_size = PAST_64BITS_MMIO_SIZE; + let past_mmio64 = AddressAllocator::new(past_mmio64_start, past_mmio64_size) + .map_err(ConvertError::Allocator)?; + + // System memory allocator + let mut system_mem = AddressAllocator::new(SYSTEM_MEM_START, SYSTEM_MEM_SIZE) + .map_err(ConvertError::Allocator)?; + + // Collect all used GSIs and MMIO addresses from devices + let mut used_legacy_gsis: Vec = Vec::new(); + let mut used_msi_gsis: Vec = Vec::new(); + let mut used_mmio32_addrs: Vec<(u64, u64)> = Vec::new(); // (addr, len) + + // Helper to classify and record a device's MMIODeviceInfo. + // On aarch64, v1.12 stores IRQ numbers starting from IRQ_BASE=32 (physical SPI), + // while v1.14 uses 0-based GSI numbers. We convert with irq_to_gsi(). + // Also: only record MMIO addresses within the v1.14 mmio32_memory range + // [MEM_32BIT_DEVICES_START, ...). Addresses below that (serial, RTC, early virtio + // devices allocated from v1.12's single MMIO allocator) are not tracked by the + // v1.14 mmio32_memory allocator and must be skipped. 
+ let mut record_device_info = |info: &v1_12::MMIODeviceInfo| { + if let Some(irq) = info.irq { + let gsi = irq_to_gsi(irq); + if (GSI_LEGACY_START..=GSI_LEGACY_END).contains(&gsi) { + used_legacy_gsis.push(gsi); + } else if (GSI_MSI_START..=GSI_MSI_END).contains(&gsi) { + used_msi_gsis.push(gsi); + } + } + // Only record addresses within the v1.14 mmio32_memory range + if info.addr >= MEM_32BIT_DEVICES_START { + used_mmio32_addrs.push((info.addr, info.len)); + } + }; + + for dev in &device_states.block_devices { + record_device_info(&dev.device_info); + } + for dev in &device_states.net_devices { + record_device_info(&dev.device_info); + } + if let Some(dev) = &device_states.vsock_device { + record_device_info(&dev.device_info); + } + if let Some(dev) = &device_states.balloon_device { + record_device_info(&dev.device_info); + } + if let Some(dev) = &device_states.entropy_device { + record_device_info(&dev.device_info); + } + + #[cfg(target_arch = "aarch64")] + for dev in &device_states.legacy_devices { + record_device_info(&dev.device_info); + } + + // Also account for VMGenID's legacy GSI. + // v1.12 stores IRQ_BASE-based values; convert to v1.14 0-based GSI. + if let Some(vmgenid) = &acpi_state.vmgenid { + let gsi = irq_to_gsi(vmgenid.gsi); + if (GSI_LEGACY_START..=GSI_LEGACY_END).contains(&gsi) { + used_legacy_gsis.push(gsi); + } + } + + // Reconstruct legacy GSI allocator + // IdAllocator allocates sequentially. To reconstruct it, we allocate IDs up to + // max(used_ids) and free the ones we didn't use. 
+ if !used_legacy_gsis.is_empty() { + let max_gsi = *used_legacy_gsis.iter().max().unwrap(); + let used_set: std::collections::HashSet = + used_legacy_gsis.iter().cloned().collect(); + + // Allocate all IDs from start to max + let mut allocated = Vec::new(); + for id in GSI_LEGACY_START..=max_gsi { + let got = gsi_legacy.allocate_id().map_err(ConvertError::Allocator)?; + allocated.push(got); + assert_eq!(got, id, "IdAllocator must allocate sequentially"); + } + // Free the ones not in use + for id in GSI_LEGACY_START..=max_gsi { + if !used_set.contains(&id) { + gsi_legacy.free_id(id).map_err(ConvertError::Allocator)?; + } + } + } + + // Reconstruct MSI GSI allocator (similarly) + if !used_msi_gsis.is_empty() { + let max_gsi = *used_msi_gsis.iter().max().unwrap(); + let used_set: std::collections::HashSet = used_msi_gsis.iter().cloned().collect(); + + for id in GSI_MSI_START..=max_gsi { + let got = gsi_msi.allocate_id().map_err(ConvertError::Allocator)?; + assert_eq!(got, id); + } + for id in GSI_MSI_START..=max_gsi { + if !used_set.contains(&id) { + gsi_msi.free_id(id).map_err(ConvertError::Allocator)?; + } + } + } + + // Reconstruct 32-bit MMIO allocator + // Each MMIO device was allocated with FirstMatch policy, so they were assigned + // sequentially. We use ExactMatch to mark each address as used. + for (addr, len) in &used_mmio32_addrs { + mmio32 + .allocate(*len, 1, AllocPolicy::ExactMatch(*addr)) + .map_err(|_| ConvertError::DuplicateAddress(*addr))?; + } + + // Reconstruct system memory allocator. + // In v1.12, VMGenID was allocated with LastMatch (highest addr in system_memory). + // VmClock (x86_64 only, new in v1.14) will be allocated in ACPIDeviceManagerState::from + // using LastMatch, which will place it just below the VMGenID region. + // We mark the VMGenID address as used here so the VmClock allocation in + // ACPIDeviceManagerState::from gets the correct (lower) address. 
+ if let Some(vmgenid) = &acpi_state.vmgenid { + system_mem + .allocate(VMGENID_MEM_SIZE, 8, AllocPolicy::ExactMatch(vmgenid.addr)) + .map_err(|_| ConvertError::DuplicateAddress(vmgenid.addr))?; + } + + Ok(ResourceAllocator { + gsi_legacy_allocator: gsi_legacy, + gsi_msi_allocator: gsi_msi, + mmio32_memory: mmio32, + mmio64_memory: mmio64, + past_mmio64_memory: past_mmio64, + system_memory: system_mem, + }) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Top-level MicrovmState (v1.14) +// ─────────────────────────────────────────────────────────────────── +impl TryFrom for MicrovmState { + type Error = ConvertError; + + fn try_from(old: v1_12::MicrovmState) -> Result { + // Reconstruct ResourceAllocator from device info + let mut resource_allocator = + ResourceAllocator::from(&old.device_states, &old.acpi_dev_state)?; + + // Convert ACPI state (also allocates VmClock from resource_allocator on x86_64) + let acpi_state = ACPIDeviceManagerState::from(old.acpi_dev_state, &mut resource_allocator)?; + + // Convert device states + let mmio_state = DeviceStates::from(old.device_states); + + let device_states = DevicesState { + mmio_state, + acpi_state, + pci_state: PciDevicesState::default(), + }; + + // Convert VM state (embeds the reconstructed resource allocator) + let vm_state = VmState::from(old.vm_state, resource_allocator); + + // x86_64: VcpuState is the same type in v1.12 and v1.14. + // aarch64: VcpuState gains pvtime_ipa field, needs conversion. 
+ #[cfg(target_arch = "x86_64")] + let vcpu_states = old.vcpu_states; + #[cfg(target_arch = "aarch64")] + let vcpu_states: Vec = + old.vcpu_states.into_iter().map(VcpuState::from).collect(); + + Ok(MicrovmState { + vm_info: old.vm_info, + kvm_state: old.kvm_state, + vm_state, + vcpu_states, + device_states, + }) + } +} diff --git a/src/vmm/src/persist/v1_14/x86_64.rs b/src/vmm/src/persist/v1_14/x86_64.rs new file mode 100644 index 00000000000..d772c78016e --- /dev/null +++ b/src/vmm/src/persist/v1_14/x86_64.rs @@ -0,0 +1,93 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::kvm_xsave; +use vm_allocator::AllocPolicy; + +use super::v1_12; +use crate::devices::acpi::generated::vmclock_abi::{ + VMCLOCK_COUNTER_INVALID, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, +}; +use crate::{ + arch::VmState, + devices::acpi::vmclock::{VMCLOCK_SIZE, VmClockState}, + persist::v1_14::ConvertError, +}; + +use super::{ACPIDeviceManagerState, GuestMemoryState, ResourceAllocator}; + +pub use kvm_bindings::Xsave; + +// ─────────────────────────────────────────────────────────────────── +// ACPI device state impl (x86_64: allocates vmclock) +// ─────────────────────────────────────────────────────────────────── + +impl ACPIDeviceManagerState { + pub(crate) fn from( + s: v1_12::ACPIDeviceManagerState, + resource_allocator: &mut ResourceAllocator, + ) -> Result { + let vmgenid = s.vmgenid.ok_or(ConvertError::MissingVmGenId)?; + + // Allocate VmClock from system memory using LastMatch (same as VmClock::new()) + // VmClock must be allocated after VMGenID in the system memory allocator reconstruction. + let vmclock_addr = resource_allocator + .system_memory + .allocate( + VMCLOCK_SIZE as u64, + VMCLOCK_SIZE as u64, + AllocPolicy::LastMatch, + ) + .map_err(ConvertError::Allocator)? 
+ .start(); + + let vmclock = VmClockState { + guest_address: vmclock_addr, + inner: vmclock_abi { + magic: VMCLOCK_MAGIC, + size: VMCLOCK_SIZE, + version: 1, + clock_status: VMCLOCK_STATUS_UNKNOWN, + counter_id: VMCLOCK_COUNTER_INVALID, + ..Default::default() + }, + }; + + Ok(ACPIDeviceManagerState { vmgenid, vmclock }) + } +} + +// ─────────────────────────────────────────────────────────────────── +// VM state (x86_64, v1.14: adds resource_allocator) +// ─────────────────────────────────────────────────────────────────── +impl VmState { + pub(crate) fn from(s: v1_12::VmState, resource_allocator: ResourceAllocator) -> VmState { + VmState { + memory: GuestMemoryState::from(s.memory), + resource_allocator, + pitstate: s.pitstate, + clock: s.clock, + pic_master: s.pic_master, + pic_slave: s.pic_slave, + ioapic: s.ioapic, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Helper used by v1_12::VcpuState::from(v1_10::VcpuState) +// ─────────────────────────────────────────────────────────────────── + +/// Convert a v1.10 `kvm_xsave` into a v1.12/v1.14 `Xsave` (= `FamStructWrapper`). +/// +/// v1.12 introduced `Xsave` to support Intel AMX extended save state (extra FAM entries). +/// A snapshot from v1.10 has no AMX state, so `len = 0` (zero FAM entries). +pub(crate) fn xsave_from_v1_10(old: kvm_xsave) -> Xsave { + let mut xsave = Xsave::new(0).expect("failed to allocate Xsave wrapper"); + // SAFETY: We only overwrite the `xsave` sub-field, not `len`, so the + // FamStructWrapper length invariant is preserved. + unsafe { + xsave.as_mut_fam_struct().xsave = old; + } + xsave +} From ee3f6fa6da46ca3c3f3ff3a4572bead2064bc243 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 27 Feb 2026 14:27:36 +0100 Subject: [PATCH 52/53] feat: allow loading older snapshots Now that we have logic for translating snapshot formats, we can allow the /snapshot/load API to parse v1.10 and v1.12 snapshots. 
We change the logic that parses the snapshot file to first read the version from the file and then (if needed) translate it to the expected v1.14 version. Currently older versions supported are v1.10 and v1.12. Signed-off-by: Babis Chalios --- src/vmm/src/persist/mod.rs | 35 +++++++++++++++++++++++++++++++---- src/vmm/src/snapshot/mod.rs | 13 ++++--------- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/src/vmm/src/persist/mod.rs b/src/vmm/src/persist/mod.rs index 337d1389f65..6a728f44a67 100644 --- a/src/vmm/src/persist/mod.rs +++ b/src/vmm/src/persist/mod.rs @@ -11,6 +11,7 @@ use std::os::unix::io::AsRawFd; use std::os::unix::net::UnixStream; use std::path::Path; use std::sync::{Arc, Mutex}; +use std::time::Instant; use semver::Version; use serde::{Deserialize, Serialize}; @@ -29,7 +30,7 @@ use crate::device_manager::{DevicePersistError, DevicesState}; use crate::logger::{info, warn}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; -use crate::snapshot::Snapshot; +use crate::snapshot::{Snapshot, SnapshotError, SnapshotHdr}; use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; @@ -452,10 +453,36 @@ pub enum SnapshotStateFromFileError { fn snapshot_state_from_file( snapshot_path: &Path, ) -> Result { - let mut snapshot_reader = File::open(snapshot_path)?; - let snapshot = Snapshot::load(&mut snapshot_reader)?; + let start = Instant::now(); - Ok(snapshot.data) + let data = std::fs::read(snapshot_path)?; + let version = SnapshotHdr::load(&mut data.as_slice())?.version; + + let mut snapshot_reader = data.as_slice(); + let data = match (version.major, version.minor) { + (8, 0) => Snapshot::load(&mut snapshot_reader)?.data, + (6, 0) => { + let v12_state = Snapshot::::load(&mut snapshot_reader)?; + MicrovmState::try_from(v12_state.data).unwrap() + } + (4, 0) => { + let v10_state = Snapshot::::load(&mut snapshot_reader)?; + let v12_state = 
v1_12::MicrovmState::from(v10_state.data); + MicrovmState::try_from(v12_state).unwrap() + } + _ => { + return Err(SnapshotStateFromFileError::Load( + SnapshotError::InvalidFormatVersion(version), + )); + } + }; + + info!( + "Loading snapshot file took {} usec", + start.elapsed().as_micros() + ); + + Ok(data) } /// Error type for [`guest_memory_from_file`]. diff --git a/src/vmm/src/snapshot/mod.rs b/src/vmm/src/snapshot/mod.rs index 76b5203298d..360b823712b 100644 --- a/src/vmm/src/snapshot/mod.rs +++ b/src/vmm/src/snapshot/mod.rs @@ -81,26 +81,21 @@ fn serialize(data: &S, write: &mut W) -> Result<(), Snap /// Firecracker snapshot header #[derive(Debug, Serialize, Deserialize)] -struct SnapshotHdr { +pub struct SnapshotHdr { /// magic value - magic: u64, + pub magic: u64, /// Snapshot data version - version: Version, + pub version: Version, } impl SnapshotHdr { - fn load(buf: &mut &[u8]) -> Result { + pub(crate) fn load(buf: &mut &[u8]) -> Result { let (hdr, bytes_read) = bincode::serde::decode_from_slice::(buf, BINCODE_CONFIG)?; if hdr.magic != SNAPSHOT_MAGIC_ID { return Err(SnapshotError::InvalidMagic(hdr.magic)); } - if hdr.version.major != SNAPSHOT_VERSION.major || hdr.version.minor > SNAPSHOT_VERSION.minor - { - return Err(SnapshotError::InvalidFormatVersion(hdr.version)); - } - *buf = &buf[bytes_read..]; Ok(hdr) From 458ca91761c4335441fb8438c9e51b5b8fc2649c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 23 Mar 2026 11:45:28 +0000 Subject: [PATCH 53/53] fix: compilation in aarch64 Changes we did for supporting older snapshot formats, did not really compile on ARM systems. Fix the compilation issues. The issues were mainly bad re-exports. 
Signed-off-by: Babis Chalios --- .../arch/aarch64/gic/gicv3/regs/its_regs.rs | 2 +- .../src/arch/aarch64/gic/gicv3/regs/mod.rs | 10 +- src/vmm/src/arch/aarch64/gic/mod.rs | 3 +- src/vmm/src/arch/aarch64/gic/regs.rs | 13 +- src/vmm/src/persist/v1_10/aarch64.rs | 14 +- src/vmm/src/persist/v1_12/aarch64.rs | 3 +- src/vmm/src/persist/v1_12/mod.rs | 1 + src/vmm/src/persist/v1_14/aarch64.rs | 129 +++++------------- src/vmm/src/persist/v1_14/mod.rs | 6 +- 9 files changed, 61 insertions(+), 120 deletions(-) diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs index ee4ecafba1e..8f898a09301 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs @@ -80,7 +80,7 @@ pub fn its_restore_tables(its_fd: &DeviceFd) -> Result<(), GicError> { } /// ITS registers that we save/restore during snapshot -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct ItsRegisterState { iidr: u64, cbaser: u64, diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs index 3df0d4642d7..914fdf45d76 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs @@ -57,9 +57,13 @@ pub fn restore_state( icc_regs::set_icc_regs(gic_device, *mpidr, &vcpu_state.icc)?; } - // Safe to unwrap here, as we know we support an ITS device, so `its_state.is_some()` is always - // `true`. - state.its_state.as_ref().unwrap().restore(its_device) + // `its_state` is `None` when loading a snapshot created by an older Firecracker version that + // did not save ITS state. In that case, skip ITS restore and leave the ITS in its reset + // state; the guest kernel will re-initialize it. 
+ if let Some(its_state) = &state.its_state { + its_state.restore(its_device)?; + } + Ok(()) } #[cfg(test)] diff --git a/src/vmm/src/arch/aarch64/gic/mod.rs b/src/vmm/src/arch/aarch64/gic/mod.rs index 9bfabee1fea..0fe0aa899b3 100644 --- a/src/vmm/src/arch/aarch64/gic/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/mod.rs @@ -8,7 +8,8 @@ mod regs; use gicv2::GICv2; use gicv3::GICv3; use kvm_ioctls::{DeviceFd, VmFd}; -pub use regs::GicState; +pub use regs::{GicRegState, GicState, GicVcpuState, VgicSysRegsState}; +pub use gicv3::regs::its_regs::ItsRegisterState; use super::layout; diff --git a/src/vmm/src/arch/aarch64/gic/regs.rs b/src/vmm/src/arch/aarch64/gic/regs.rs index 1afa7acde9c..d05b4568904 100644 --- a/src/vmm/src/arch/aarch64/gic/regs.rs +++ b/src/vmm/src/arch/aarch64/gic/regs.rs @@ -12,20 +12,23 @@ use serde::{Deserialize, Serialize}; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::gicv3::regs::its_regs::ItsRegisterState; -#[derive(Debug, Serialize, Deserialize)] +/// Serializable state for a block of GIC registers. +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct GicRegState { pub(crate) chunks: Vec, } /// Structure for serializing the state of the Vgic ICC regs -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct VgicSysRegsState { + /// Main ICC system registers. pub main_icc_regs: Vec>, + /// AP ICC system registers (one entry per priority group). pub ap_icc_regs: Vec>>, } /// Structure used for serializing the state of the GIC registers. -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct GicState { /// The state of the distributor registers. pub dist: Vec>, @@ -36,9 +39,11 @@ pub struct GicState { } /// Structure used for serializing the state of the GIC registers for a specific vCPU. 
-#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct GicVcpuState { + /// Redistributor registers for this vCPU. pub rdist: Vec>, + /// ICC (CPU interface) system registers for this vCPU. pub icc: VgicSysRegsState, } diff --git a/src/vmm/src/persist/v1_10/aarch64.rs b/src/vmm/src/persist/v1_10/aarch64.rs index ff7ab011a78..c85896a0b32 100644 --- a/src/vmm/src/persist/v1_10/aarch64.rs +++ b/src/vmm/src/persist/v1_10/aarch64.rs @@ -3,20 +3,14 @@ use serde::{Deserialize, Serialize}; -use super::{KvmCapability, MMIODeviceInfo}; +use crate::cpu_config::templates::KvmCapability; +use super::MMIODeviceInfo; // Types that are identical across all versions — canonical definitions in v1_14. -pub use crate::v1_14::{ - StaticCpuTemplate, - DeviceType, - GicRegState, - VgicSysRegsState, - GicVcpuState, - Aarch64RegisterVec, -}; +pub use crate::persist::v1_14::DeviceType; // Types that are identical in v1.10 and v1.12 — canonical definitions in v1_12. 
-pub use crate::v1_12::{ +pub use crate::persist::v1_12::{ // aarch64 GicState is identical in v1.10 and v1.12 (gains its_state in v1.14) GicState, // aarch64 VcpuState is identical in v1.10 and v1.12 (gains pvtime_ipa in v1.14) diff --git a/src/vmm/src/persist/v1_12/aarch64.rs b/src/vmm/src/persist/v1_12/aarch64.rs index 7c66c2f1dae..f57079ea6ae 100644 --- a/src/vmm/src/persist/v1_12/aarch64.rs +++ b/src/vmm/src/persist/v1_12/aarch64.rs @@ -7,12 +7,11 @@ use serde::{Deserialize, Serialize}; use super::{GuestMemoryState, MMIODeviceInfo}; // Types that are canonical in v1_14 and unchanged through all versions -pub use crate::v1_14::{ +pub use crate::persist::v1_14::{ // Legacy device type enum DeviceType, // GIC helper types (GicState itself changed — its_state added — so redefined in v1_14) GicRegState, - VgicSysRegsState, GicVcpuState, // Register vector with custom serde Aarch64RegisterVec, diff --git a/src/vmm/src/persist/v1_12/mod.rs b/src/vmm/src/persist/v1_12/mod.rs index 85ba0d00b31..32cf5bd7635 100644 --- a/src/vmm/src/persist/v1_12/mod.rs +++ b/src/vmm/src/persist/v1_12/mod.rs @@ -19,6 +19,7 @@ use serde::{Deserialize, Serialize}; use super::v1_10; +#[cfg(target_arch = "x86_64")] use crate::arch::VcpuState; use crate::devices::acpi::vmgenid::VMGenIDState; use crate::devices::virtio::balloon::persist::BalloonConfigSpaceState; diff --git a/src/vmm/src/persist/v1_14/aarch64.rs b/src/vmm/src/persist/v1_14/aarch64.rs index f203ea6fee6..8ac93332eb9 100644 --- a/src/vmm/src/persist/v1_14/aarch64.rs +++ b/src/vmm/src/persist/v1_14/aarch64.rs @@ -1,18 +1,25 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 -use kvm_bindings::{kvm_mp_state, kvm_vcpu_init}; use serde::{Deserialize, Serialize}; -use crate::convert::{ConvertError, irq_to_gsi}; -use crate::v1_12; +use super::{ACPIDeviceManagerState, ConvertError, GuestMemoryState, MMIODeviceInfo, + ResourceAllocator, irq_to_gsi}; +use crate::devices::acpi::vmgenid::VMGenIDState; +use crate::persist::v1_12; -use super::{ - ACPIDeviceManagerState, GuestMemoryState, MMIODeviceInfo, ResourceAllocator, VMGenIDState, -}; +// ─────────────────────────────────────────────────────────────────── +// Re-export runtime types — v1.14 snapshot format matches the runtime format. +// These are used by v1.12 (and v1.10 via v1.12) as canonical type definitions. +// ─────────────────────────────────────────────────────────────────── + +pub use crate::arch::aarch64::gic::{GicRegState, GicState, GicVcpuState}; +pub use crate::arch::aarch64::regs::Aarch64RegisterVec; +pub use crate::arch::aarch64::vcpu::VcpuState; +pub use crate::arch::aarch64::vm::VmState; // ─────────────────────────────────────────────────────────────────── -// StaticCpuTemplate — canonical definition (identical in v1.10, v1.12, v1.14) +// StaticCpuTemplate — aarch64-specific snapshot enum (same in v1.10, v1.12, v1.14) // ─────────────────────────────────────────────────────────────────── #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -23,7 +30,7 @@ pub enum StaticCpuTemplate { } // ─────────────────────────────────────────────────────────────────── -// aarch64 legacy device types — canonical definitions +// DeviceType — aarch64 legacy device type enum (snapshot format) // ─────────────────────────────────────────────────────────────────── #[derive(Debug, Clone, Serialize, Deserialize)] @@ -33,94 +40,34 @@ pub enum DeviceType { Rtc, } -// ─────────────────────────────────────────────────────────────────── -// GIC helper types — canonical definitions (unchanged since v1.10) -// 
─────────────────────────────────────────────────────────────────── - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(bound(serialize = "T: Serialize", deserialize = "T: for<'a> Deserialize<'a>"))] -pub struct GicRegState Deserialize<'a>> { - pub chunks: Vec, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct VgicSysRegsState { - pub main_icc_regs: Vec>, - pub ap_icc_regs: Vec>>, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct GicVcpuState { - pub rdist: Vec>, - pub icc: VgicSysRegsState, -} - -// ─────────────────────────────────────────────────────────────────── -// aarch64 register vector — canonical definition (unchanged since v1.10) -// ─────────────────────────────────────────────────────────────────── - -/// aarch64 register vector with custom serde: serialized as (Vec, Vec) -#[derive(Debug, Clone)] -pub struct Aarch64RegisterVec { - pub ids: Vec, - pub data: Vec, -} - -impl Serialize for Aarch64RegisterVec { - fn serialize(&self, serializer: S) -> Result { - (&self.ids, &self.data).serialize(serializer) - } -} - -impl<'de> Deserialize<'de> for Aarch64RegisterVec { - fn deserialize>(deserializer: D) -> Result { - let (ids, data) = <(Vec, Vec)>::deserialize(deserializer)?; - Ok(Aarch64RegisterVec { ids, data }) +impl From for crate::arch::DeviceType { + fn from(dt: DeviceType) -> Self { + match dt { + DeviceType::Virtio(n) => crate::arch::DeviceType::Virtio(n), + DeviceType::Serial => crate::arch::DeviceType::Serial, + DeviceType::Rtc => crate::arch::DeviceType::Rtc, + } } } // ─────────────────────────────────────────────────────────────────── -// aarch64 ConnectedLegacyState (uses updated MMIODeviceInfo with gsi) +// ConnectedLegacyState — convert v1.12 snapshot type to runtime type // ─────────────────────────────────────────────────────────────────── -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedLegacyState { - pub type_: DeviceType, - pub device_info: MMIODeviceInfo, -} - -impl From for 
ConnectedLegacyState { +impl From for crate::device_manager::persist::ConnectedLegacyState { fn from(s: v1_12::ConnectedLegacyState) -> Self { - ConnectedLegacyState { - type_: s.type_, + crate::device_manager::persist::ConnectedLegacyState { + type_: crate::arch::DeviceType::from(s.type_), device_info: MMIODeviceInfo::from(s.device_info), } } } // ─────────────────────────────────────────────────────────────────── -// aarch64 GIC state (v1.14: adds its_state) -// GicRegState, VgicSysRegsState, GicVcpuState are defined above +// GIC state (aarch64, v1.14: adds its_state) +// GicState is the runtime type (re-exported above); conversion from v1.12 is here. // ─────────────────────────────────────────────────────────────────── -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ItsRegisterState { - pub iidr: u64, - pub cbaser: u64, - pub creadr: u64, - pub cwriter: u64, - pub baser: [u64; 8], - pub ctlr: u64, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct GicState { - pub dist: Vec>, - pub gic_vcpu_states: Vec, - /// ITS state (GICv3 only). None for GICv2 or when converted from v1.12. - pub its_state: Option, -} - impl GicState { pub(crate) fn from(old_state: v1_12::GicState) -> GicState { GicState { @@ -133,17 +80,9 @@ impl GicState { // ─────────────────────────────────────────────────────────────────── // vCPU state (aarch64, v1.14: gains pvtime_ipa) +// VcpuState is the runtime type (re-exported above); conversion from v1.12 is here. 
// ─────────────────────────────────────────────────────────────────── -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct VcpuState { - pub mp_state: kvm_mp_state, - pub regs: Aarch64RegisterVec, - pub mpidr: u64, - pub kvi: kvm_vcpu_init, - pub pvtime_ipa: Option, -} - impl VcpuState { pub(crate) fn from(old_state: v1_12::VcpuState) -> VcpuState { VcpuState { @@ -157,7 +96,7 @@ impl VcpuState { } // ─────────────────────────────────────────────────────────────────── -// ACPI device state impl (aarch64: no vmclock) +// ACPI device state (aarch64: no vmclock) // ─────────────────────────────────────────────────────────────────── impl ACPIDeviceManagerState { @@ -178,15 +117,9 @@ impl ACPIDeviceManagerState { // ─────────────────────────────────────────────────────────────────── // VM state (aarch64, v1.14: adds resource_allocator) +// VmState is the runtime type (re-exported above); conversion from v1.12 is here. // ─────────────────────────────────────────────────────────────────── -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct VmState { - pub memory: GuestMemoryState, - pub gic: GicState, - pub resource_allocator: ResourceAllocator, -} - impl VmState { pub(crate) fn from( old_state: v1_12::VmState, diff --git a/src/vmm/src/persist/v1_14/mod.rs b/src/vmm/src/persist/v1_14/mod.rs index dcaddb6e8c4..eb780bbe6f5 100644 --- a/src/vmm/src/persist/v1_14/mod.rs +++ b/src/vmm/src/persist/v1_14/mod.rs @@ -34,14 +34,18 @@ pub use aarch64::*; use crate::arch::{ FIRST_ADDR_PAST_64BITS_MMIO, GSI_LEGACY_END, GSI_LEGACY_START, GSI_MSI_END, GSI_MSI_START, MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, MEM_64BIT_DEVICES_SIZE, - MEM_64BIT_DEVICES_START, PAST_64BITS_MMIO_SIZE, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, VmState, + MEM_64BIT_DEVICES_START, PAST_64BITS_MMIO_SIZE, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, }; +#[cfg(target_arch = "x86_64")] +use crate::arch::VmState; use crate::device_manager::DevicesState; use crate::device_manager::mmio::MMIODeviceInfo; use 
crate::device_manager::pci_mngr::PciDevicesState; use crate::device_manager::persist::{ ACPIDeviceManagerState, DeviceStates, MmdsState, VirtioDeviceState as ConnectedDeviceState, }; +#[cfg(target_arch = "aarch64")] +use crate::device_manager::persist::ConnectedLegacyState; use crate::devices::acpi::vmgenid::VMGENID_MEM_SIZE; use crate::devices::virtio::balloon::device::HintingState; use crate::devices::virtio::balloon::persist::{BalloonState, BalloonStatsState};