diff --git a/.github/workflows/dependency_modification_check.yml b/.github/workflows/dependency_modification_check.yml deleted file mode 100644 index ac6537af102..00000000000 --- a/.github/workflows/dependency_modification_check.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Check no dependencies were modified - -on: pull_request - -jobs: - dependency_changed_check: - runs-on: ubuntu-latest - steps: - - name: "Checkout repository" - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - - - name: "Check Cargo.lock not in changeset" - run: | - git fetch origin - git diff origin/$GITHUB_BASE_REF.. --name-only| ( ! grep "Cargo.lock") diff --git a/CHANGELOG.md b/CHANGELOG.md index 080adf57257..b6a5ec5d61a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,63 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.14.4] + +### Fixed + +- [#5762](https://github.com/firecracker-microvm/firecracker/pull/5762): Cap + virtio-rng per-request entropy to 64 KiB. Previously, a guest could construct + a descriptor chain that caused Firecracker to allocate more host memory than + the guest actually provided, potentially leading to excessive host memory + consumption. +- [#5818](https://github.com/firecracker-microvm/firecracker/pull/5818): Enforce + the virtio device initialization sequence in the PCI transport, matching the + existing MMIO transport behavior. The PCI transport now validates device + status transitions, rejects queue configuration writes outside the FEATURES_OK + to DRIVER_OK window, rejects feature negotiation outside the DRIVER state, + blocks re-initialization after a failed reset, and sets DEVICE_NEEDS_RESET + when device activation fails. 
+- [#5818](https://github.com/firecracker-microvm/firecracker/pull/5818): Reject + device status writes that clear previously set bits in the MMIO transport, + except for reset. +- [#5780](https://github.com/firecracker-microvm/firecracker/pull/5780): Fixed + missing `/sys/devices/system/cpu/cpu*/cache/*` in aarch64 guests when running + on host kernels >= 6.3 with guest kernels >= 6.1.156. +- [#5793](https://github.com/firecracker-microvm/firecracker/pull/5793): Fixed + virtio-mem plug/unplug skipping KVM slot updates for memory blocks not aligned + to a slot boundary. On plug, this could leave hotplugged memory inaccessible + to the guest. On unplug, the guest could retain access to memory that + Firecracker considered freed. +- [#5794](https://github.com/firecracker-microvm/firecracker/pull/5794): Bound + balloon statistics descriptor length to prevent a guest-controlled oversized + descriptor from temporarily stalling the VMM event loop. Only affects microVMs + with `stats_polling_interval_s > 0`. +- [#5809](https://github.com/firecracker-microvm/firecracker/pull/5809): Fixed a + bug on host Linux >= 5.16 for x86_64 guests using the `kvm-clock` clock source + causing the monotonic clock to jump on restore by the wall-clock time elapsed + since the snapshot was taken. Users using `kvm-clock` that want to explicitly + advance the clock with `KVM_CLOCK_REALTIME` can opt back in using the new + `clock_realtime` flag in `LoadSnapshot` API. + +## [1.14.3] + +### Fixed + +- [#5739](https://github.com/firecracker-microvm/firecracker/pull/5739): Fixed + validation of TCP SYN options length when MMDS is enabled. + +## [1.14.2] + +### Fixed + +- [#5698](https://github.com/firecracker-microvm/firecracker/pull/5698): Fixed + the possible ENXIO error which could occur during file open operation if the + underlying file is FIFO without active readers already attached. 
+- [#5705](https://github.com/firecracker-microvm/firecracker/pull/5705): Fixed a + bug that caused Firecracker to corrupt the memory files of differential + snapshots for VMs with multiple memory slots. This affected VMs using memory + hot-plugging or any x86 VMs with a memory size larger than 3GiB. + ## [1.14.1] ### Changed diff --git a/CREDITS.md b/CREDITS.md index abc698944d6..f15b0797896 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -130,6 +130,7 @@ Contributors to the Firecracker repository: - huang-jl <1046678590@qq.com> - Iggy Jackson - ihciah +- Ilias Stamatis - Ioana Chirca - Ishwor Gurung - Iulian Barbu diff --git a/Cargo.lock b/Cargo.lock index 708e381624b..c159666eefb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,27 +89,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "aws-lc-fips-sys" -version = "0.13.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57900537c00a0565a35b63c4c281b372edfc9744b072fd4a3b414350a8f5ed48" -dependencies = [ - "bindgen 0.72.1", - "cc", - "cmake", - "dunce", - "fs_extra", - "regex", -] - [[package]] name = "aws-lc-rs" -version = "1.15.1" +version = "1.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b5ce75405893cd713f9ab8e297d8e438f624dde7d706108285f7e17a25a180f" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" dependencies = [ - "aws-lc-fips-sys", "aws-lc-sys", "untrusted", "zeroize", @@ -117,11 +102,10 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.34.0" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "179c3777a8b5e70e90ea426114ffc565b2c1a9f82f6c4a0c5a34aa6ef5e781b6" +checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" dependencies = [ - "bindgen 0.72.1", "cc", "cmake", "dunce", @@ -169,27 +153,7 @@ 
dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash 1.1.0", - "shlex", - "syn", -] - -[[package]] -name = "bindgen" -version = "0.72.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" -dependencies = [ - "bitflags 2.10.0", - "cexpr", - "clang-sys", - "itertools 0.12.1", - "log", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash 2.1.1", + "rustc-hash", "shlex", "syn", ] @@ -391,7 +355,7 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "cpu-template-helper" -version = "1.14.1" +version = "1.14.4" dependencies = [ "clap", "displaydoc", @@ -533,7 +497,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -554,7 +518,7 @@ checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" [[package]] name = "firecracker" -version = "1.14.1" +version = "1.14.4" dependencies = [ "cargo_toml", "displaydoc", @@ -569,7 +533,7 @@ dependencies = [ "serde_json", "thiserror 2.0.17", "timerfd", - "userfaultfd", + "userfaultfd 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", "utils", "vmm", "vmm-sys-util", @@ -703,7 +667,7 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jailer" -version = "1.14.1" +version = "1.14.4" dependencies = [ "libc", "log-instrument", @@ -983,16 +947,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "prettyplease" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" -dependencies = [ - "proc-macro2", - "syn", -] - [[package]] name = "proc-macro2" version = "1.0.103" @@ -1078,7 +1032,7 @@ dependencies = [ 
[[package]] name = "rebase-snap" -version = "1.14.1" +version = "1.14.4" dependencies = [ "displaydoc", "libc", @@ -1123,12 +1077,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" -[[package]] -name = "rustc-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - [[package]] name = "rustix" version = "0.38.44" @@ -1152,7 +1100,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.9.4", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -1178,7 +1126,7 @@ dependencies = [ [[package]] name = "seccompiler" -version = "1.14.1" +version = "1.14.4" dependencies = [ "bincode", "clap", @@ -1275,7 +1223,7 @@ checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "snapshot-editor" -version = "1.14.1" +version = "1.14.4" dependencies = [ "clap", "clap-num", @@ -1451,7 +1399,20 @@ dependencies = [ "libc", "nix", "thiserror 1.0.69", - "userfaultfd-sys", + "userfaultfd-sys 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "userfaultfd" +version = "0.9.0" +source = "git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection#9f4f7b42adbb9bea59016f4af248ed547cf160f0" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "libc", + "nix", + "thiserror 1.0.69", + "userfaultfd-sys 0.6.0 (git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection)", ] [[package]] @@ -1460,7 +1421,17 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc91d95a797a81604af22946d0e86656f27feb0b9665c60665cf3554df12d1a8" dependencies = [ - "bindgen 0.69.5", + "bindgen", + "cc", + "cfg-if", +] + +[[package]] +name = "userfaultfd-sys" +version = "0.6.0" +source = 
"git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection#9f4f7b42adbb9bea59016f4af248ed547cf160f0" +dependencies = [ + "bindgen", "cc", "cfg-if", ] @@ -1583,7 +1554,7 @@ dependencies = [ "slab", "thiserror 2.0.17", "timerfd", - "userfaultfd", + "userfaultfd 0.9.0 (git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection)", "utils", "uuid", "vhost", @@ -1693,7 +1664,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] diff --git a/deny.toml b/deny.toml index be3a1040463..99d17573d47 100644 --- a/deny.toml +++ b/deny.toml @@ -5,8 +5,7 @@ allow = [ "Apache-2.0", "BSD-3-Clause", "ISC", - "Unicode-3.0", - "OpenSSL" + "Unicode-3.0" ] [[bans.deny]] diff --git a/docs/RELEASE_POLICY.md b/docs/RELEASE_POLICY.md index a5997ddc507..dfc0e1abe51 100644 --- a/docs/RELEASE_POLICY.md +++ b/docs/RELEASE_POLICY.md @@ -90,8 +90,8 @@ v3.1 will be patched since were the last two Firecracker releases and less than | Release | Release Date | Latest Patch | Min. end of support | Official end of Support | | ------: | -----------: | -----------: | ------------------: | :------------------------------ | -| v1.14 | 2025-12-17 | v1.14.0 | 2026-06-17 | Supported | -| v1.13 | 2025-08-28 | v1.13.1 | 2026-02-28 | Supported | +| v1.14 | 2025-12-17 | v1.14.2 | 2026-06-17 | Supported | +| v1.13 | 2025-08-28 | v1.13.2 | 2026-02-28 | Supported | | v1.12 | 2025-05-07 | v1.12.1 | 2025-11-07 | 2025-12-17 (v1.14 released) | | v1.11 | 2025-03-18 | v1.11.0 | 2025-09-18 | 2025-09-18 (end of 6mo support) | | v1.10 | 2024-11-07 | v1.10.1 | 2025-05-07 | 2025-05-07 (v1.12 released) | diff --git a/docs/design.md b/docs/design.md index b35b845b8b3..9a7e409a34d 100644 --- a/docs/design.md +++ b/docs/design.md @@ -118,7 +118,11 @@ and/or creating their own custom CPU templates. 
#### Clocksources available to guests -Firecracker only exposes kvm-clock to customers. +Firecracker exposes the following clock sources to guests: + +- x86_64: kvm-clock and tsc. Linux guests >=5.10 will pick tsc by default if + stable. +- aarch64: arch_sys_counter ### I/O: Storage, Networking and Rate Limiting diff --git a/docs/snapshotting/snapshot-support.md b/docs/snapshotting/snapshot-support.md index 6e1ac4d4c35..b1b485de5f8 100644 --- a/docs/snapshotting/snapshot-support.md +++ b/docs/snapshotting/snapshot-support.md @@ -492,6 +492,11 @@ resumed with the guest OS wall-clock continuing from the moment of the snapshot creation. For this reason, the wall-clock should be updated to the current time, on the guest-side. More details on how you could do this can be found at a [related FAQ](../../FAQ.md#my-guest-wall-clock-is-drifting-how-can-i-fix-it). +When using `kvm-clock` as the clock source on `x86_64`, you can optionally set +`clock_realtime: true` in the `LoadSnapshot` request to advance the guest clock +at restore time (this requires host Linux >= 5.16). Note that this may cause +issues within the guest, as the clock will appear to jump suddenly by the +wall-clock time elapsed since the snapshot was taken. 
## Provisioning host disk space for snapshots diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index dcd6753a4c5..1eb2d83e0f2 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -31,6 +31,9 @@ { "syscall": "mincore" }, + { + "syscall": "pread64" + }, { "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" diff --git a/src/cpu-template-helper/Cargo.toml b/src/cpu-template-helper/Cargo.toml index f0f585131fd..6d674bb6c1d 100644 --- a/src/cpu-template-helper/Cargo.toml +++ b/src/cpu-template-helper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cpu-template-helper" -version = "1.14.1" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 02d89ec4183..68f22554e77 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "firecracker" -version = "1.14.1" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" build = "build.rs" diff --git a/src/firecracker/src/api_server/mod.rs b/src/firecracker/src/api_server/mod.rs index 60daaa26639..961fc68e836 100644 --- a/src/firecracker/src/api_server/mod.rs +++ b/src/firecracker/src/api_server/mod.rs @@ -275,7 +275,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); @@ -288,7 +288,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); diff --git a/src/firecracker/src/api_server/parsed_request.rs 
b/src/firecracker/src/api_server/parsed_request.rs index f98170ccbea..478483e9ad9 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -31,6 +31,7 @@ use super::request::vsock::parse_put_vsock; use crate::api_server::request::hotplug::memory::{ parse_get_memory_hotplug, parse_patch_memory_hotplug, parse_put_memory_hotplug, }; +use crate::api_server::request::memory_info::parse_get_memory; use crate::api_server::request::serial::parse_put_serial; #[derive(Debug)] @@ -91,6 +92,7 @@ impl TryFrom<&Request> for ParsedRequest { (Method::Get, "hotplug", None) if path_tokens.next() == Some("memory") => { parse_get_memory_hotplug() } + (Method::Get, "memory", None) => parse_get_memory(path_tokens), (Method::Get, _, Some(_)) => method_to_error(Method::Get), (Method::Put, "actions", Some(body)) => parse_put_actions(body), (Method::Put, "balloon", Some(body)) => parse_put_balloon(body), @@ -196,6 +198,9 @@ impl ParsedRequest { &serde_json::json!({ "firecracker_version": version.as_str() }), ), VmmData::FullVmConfig(config) => Self::success_response_with_data(config), + VmmData::MemoryMappings(mappings) => Self::success_response_with_data(mappings), + VmmData::Memory(meminfo) => Self::success_response_with_data(meminfo), + VmmData::MemoryDirty(dirty) => Self::success_response_with_data(dirty), }, Err(vmm_action_error) => { let mut response = match vmm_action_error { @@ -610,6 +615,15 @@ pub mod tests { &serde_json::json!({ "firecracker_version": version.as_str() }).to_string(), 200, ), + VmmData::MemoryMappings(mappings) => { + http_response(&serde_json::to_string(mappings).unwrap(), 200) + } + VmmData::Memory(meminfo) => { + http_response(&serde_json::to_string(meminfo).unwrap(), 200) + } + VmmData::MemoryDirty(dirty) => { + http_response(&serde_json::to_string(dirty).unwrap(), 200) + } }; let response = ParsedRequest::convert_to_response(&data); response.write_all(&mut buf).unwrap(); diff --git 
a/src/firecracker/src/api_server/request/memory_info.rs b/src/firecracker/src/api_server/request/memory_info.rs new file mode 100644 index 00000000000..2d8e55a420e --- /dev/null +++ b/src/firecracker/src/api_server/request/memory_info.rs @@ -0,0 +1,19 @@ +use micro_http::Method; +use vmm::rpc_interface::VmmAction; + +use crate::api_server::parsed_request::{ParsedRequest, RequestError}; + +pub(crate) fn parse_get_memory<'a, T>(mut path_tokens: T) -> Result<ParsedRequest, RequestError> +where + T: Iterator<Item = &'a str>, +{ + match path_tokens.next() { + Some("mappings") => Ok(ParsedRequest::new_sync(VmmAction::GetMemoryMappings)), + Some("dirty") => Ok(ParsedRequest::new_sync(VmmAction::GetMemoryDirty)), + Some(unknown_path) => Err(RequestError::InvalidPathMethod( + format!("/memory/{}", unknown_path), + Method::Get, + )), + None => Ok(ParsedRequest::new_sync(VmmAction::GetMemory)), + } +} diff --git a/src/firecracker/src/api_server/request/mod.rs b/src/firecracker/src/api_server/request/mod.rs index 9be4617bd8e..89472c52d8e 100644 --- a/src/firecracker/src/api_server/request/mod.rs +++ b/src/firecracker/src/api_server/request/mod.rs @@ -11,6 +11,7 @@ pub mod hotplug; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory_info; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/firecracker/src/api_server/request/snapshot.rs b/src/firecracker/src/api_server/request/snapshot.rs index 8284aa66287..0f562b021b5 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -110,6 +110,7 @@ fn parse_put_snapshot_load(body: &Body) -> Result { || snapshot_config.track_dirty_pages, resume_vm: snapshot_config.resume_vm, network_overrides: snapshot_config.network_overrides, + clock_realtime: snapshot_config.clock_realtime, }; // Construct the `ParsedRequest` object. 
@@ -144,7 +145,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), @@ -158,7 +159,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), @@ -187,6 +188,7 @@ mod tests { track_dirty_pages: false, resume_vm: false, network_overrides: vec![], + clock_realtime: false, }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -217,6 +219,7 @@ mod tests { track_dirty_pages: true, resume_vm: false, network_overrides: vec![], + clock_realtime: false, }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -247,6 +250,7 @@ mod tests { track_dirty_pages: false, resume_vm: true, network_overrides: vec![], + clock_realtime: false, }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -286,6 +290,7 @@ mod tests { iface_id: String::from("eth0"), host_dev_name: String::from("vmtap2"), }], + clock_realtime: false, }; let mut parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert!( @@ -313,6 +318,7 @@ mod tests { track_dirty_pages: false, resume_vm: true, network_overrides: vec![], + clock_realtime: false, }; let parsed_request = parse_put_snapshot(&Body::new(body), Some("load")).unwrap(); assert_eq!( diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 0523dd9b08e..a9c1cc33212 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ 
b/src/firecracker/swagger/firecracker.yaml @@ -5,7 +5,7 @@ info: The API is accessible through HTTP calls on specific URLs carrying JSON modeled data. The transport medium is a Unix Domain Socket. - version: 1.14.1 + version: 1.14.4 termsOfService: "" contact: email: "firecracker-maintainers@amazon.com" @@ -786,6 +786,50 @@ paths: schema: $ref: "#/definitions/Error" + /memory/mappings: + get: + summary: Gets the memory mappings with skippable pages bitmap. + operationId: getMemoryMappings + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryMappingsResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory: + get: + summary: Gets the memory info (resident and empty pages). + description: Returns an object with resident and empty bitmaps. The resident bitmap marks all pages that are resident. The empty bitmap marks zero pages (subset of resident pages). This is checked at the pageSize of each region. All regions must have the same page size. + operationId: getMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory/dirty: + get: + summary: Gets the dirty guest memory + description: This returns the resident memory that has been written since last snapshot. + operationId: getDirtyMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/DirtyMemory" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + /version: get: summary: Gets the Firecracker version. @@ -1347,6 +1391,72 @@ definitions: description: MicroVM hypervisor build version. type: string + GuestMemoryRegionMapping: + type: object + description: Describes the region of guest memory that can be used for creating the memfile. 
+ required: + - base_host_virt_addr + - size + - offset + - page_size + properties: + base_host_virt_addr: + type: integer + size: + description: The size of the region in bytes. + type: integer + offset: + description: The offset of the region in bytes. + type: integer + page_size: + description: The page size in bytes. + type: integer + + MemoryMappingsResponse: + type: object + description: Response containing memory region mappings. + required: + - mappings + properties: + mappings: + type: array + description: The memory region mappings. + items: + $ref: "#/definitions/GuestMemoryRegionMapping" + + MemoryResponse: + type: object + description: Response containing the memory info (resident and empty pages). + required: + - resident + - empty + properties: + resident: + type: array + description: The resident bitmap as a vector of u64 values. Each bit represents if the page is resident. + items: + type: integer + format: uint64 + empty: + type: array + description: The empty bitmap as a vector of u64 values. Each bit represents if the page is zero (empty). This is a subset of the resident pages. + items: + type: integer + format: uint64 + + DirtyMemory: + type: object + description: Response containing the bitmap (one bit per page) of dirty pages of guest memory + required: + - bitmap + properties: + bitmap: + type: array + description: The dirty bitmap as a vector of u64 values. Each bit represents if the page is dirty. + items: + type: integer + format: uint64 + Logger: type: object description: @@ -1555,12 +1665,14 @@ definitions: SnapshotCreateParams: type: object required: - - mem_file_path - snapshot_path properties: mem_file_path: type: string - description: Path to the file that will contain the guest memory. + description: + Path to the file that will contain the guest memory. This field is + optional. If no path is provided, the caller is responsible for storing + the microVM's memory state by other means. 
snapshot_path: type: string description: Path to the file that will contain the microVM state. @@ -1631,6 +1743,14 @@ definitions: description: Network host device names to override items: $ref: "#/definitions/NetworkOverride" + clock_realtime: + type: boolean + description: + "[x86_64 only] When set to true, passes KVM_CLOCK_REALTIME to + KVM_SET_CLOCK on restore, advancing kvmclock by the wall-clock time + elapsed since the snapshot was taken. When false (default), kvmclock resumes + from where it was at snapshot time. This option may be extended to other clock + sources and CPU architectures in the future." TokenBucket: diff --git a/src/jailer/Cargo.toml b/src/jailer/Cargo.toml index 6a72c64a885..03a3aeceb10 100644 --- a/src/jailer/Cargo.toml +++ b/src/jailer/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "jailer" -version = "1.14.1" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" description = "Process for starting Firecracker in production scenarios; applies a cgroup/namespace isolation barrier and then drops privileges." diff --git a/src/rebase-snap/Cargo.toml b/src/rebase-snap/Cargo.toml index 8f6cee0f895..eb96d56ec16 100644 --- a/src/rebase-snap/Cargo.toml +++ b/src/rebase-snap/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rebase-snap" -version = "1.14.1" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" diff --git a/src/seccompiler/Cargo.toml b/src/seccompiler/Cargo.toml index bdd3832a8ea..d83f7b73d2f 100644 --- a/src/seccompiler/Cargo.toml +++ b/src/seccompiler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "seccompiler" -version = "1.14.1" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" description = "Program that compiles multi-threaded seccomp-bpf filters expressed as JSON into raw BPF programs, serializing them and outputting them to a file." 
diff --git a/src/snapshot-editor/Cargo.toml b/src/snapshot-editor/Cargo.toml index bd1f93926e3..eae727c2030 100644 --- a/src/snapshot-editor/Cargo.toml +++ b/src/snapshot-editor/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snapshot-editor" -version = "1.14.1" +version = "1.14.4" authors = ["Amazon Firecracker team "] edition = "2024" license = "Apache-2.0" diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index b6ab412a862..8aa23f3037c 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -17,7 +17,7 @@ gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] acpi_tables = { path = "../acpi-tables" } arrayvec = { version = "0.7.6", optional = true } -aws-lc-rs = { version = "1.15.1", features = ["bindgen"] } +aws-lc-rs = "1.16.2" base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.10.0" @@ -47,7 +47,11 @@ serde_json = "1.0.145" slab = "0.4.11" thiserror = "2.0.17" timerfd = "1.5.0" -userfaultfd = "0.9.0" +userfaultfd = { git = "https://github.com/e2b-dev/userfaultfd-rs", branch = "feat_write_protection", features = [ + "linux5_7", + "linux5_13", + "linux6_7" +] } utils = { path = "../utils" } uuid = "1.18.1" vhost = { version = "0.15.0", features = ["vhost-user-frontend"] } diff --git a/src/vmm/src/arch/aarch64/cache_info.rs b/src/vmm/src/arch/aarch64/cache_info.rs index 8f8611fe440..4c934626f1b 100644 --- a/src/vmm/src/arch/aarch64/cache_info.rs +++ b/src/vmm/src/arch/aarch64/cache_info.rs @@ -10,7 +10,7 @@ use crate::logger::warn; const MAX_CACHE_LEVEL: u8 = 7; #[derive(Debug, thiserror::Error, displaydoc::Display)] -pub(crate) enum CacheInfoError { +pub enum CacheInfoError { /// Failed to read cache information: {0} FailedToReadCacheInfo(#[from] io::Error), /// Invalid cache configuration found for {0}: {1} @@ -32,7 +32,7 @@ trait CacheStore: std::fmt::Debug { } #[derive(Debug)] -pub(crate) struct CacheEntry { +pub struct CacheEntry { // Cache Level: 1, 2, 3.. pub level: u8, // Type of cache: Unified, Data, Instruction. 
@@ -154,7 +154,7 @@ impl Default for CacheEntry { #[derive(Debug)] // Based on https://elixir.free-electrons.com/linux/v4.9.62/source/include/linux/cacheinfo.h#L11. -pub(crate) enum CacheType { +pub enum CacheType { Instruction, Data, Unified, @@ -314,6 +314,105 @@ pub(crate) fn read_cache_config( Ok(()) } +// CLIDR_EL1 field positions +// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/CLIDR-EL1--Cache-Level-ID-Register +const CLIDR_CTYPE_SHIFT: u8 = 3; // Each Ctype field is 3 bits +const CLIDR_LOC_SHIFT: u8 = 24; + +// CLIDR_EL1 Ctype field values +const CLIDR_CTYPE_NO_CACHE: u64 = 0; +const CLIDR_CTYPE_INSTRUCTION: u64 = 1; +const CLIDR_CTYPE_DATA: u64 = 2; +const CLIDR_CTYPE_SEPARATE: u64 = 3; +const CLIDR_CTYPE_UNIFIED: u64 = 4; + +/// Classify a set of cache entries at the same level into a CLIDR Ctype value. +fn ctype_for_entries<'a>(entries: impl Iterator<Item = &'a CacheEntry>) -> u64 { + let (mut has_data, mut has_inst, mut has_unified) = (false, false, false); + let mut any = false; + for c in entries { + any = true; + match c.type_ { + CacheType::Data => has_data = true, + CacheType::Instruction => has_inst = true, + CacheType::Unified => has_unified = true, + } + } + if !any { + return CLIDR_CTYPE_NO_CACHE; + } + if has_unified { + CLIDR_CTYPE_UNIFIED + } else if has_data && has_inst { + CLIDR_CTYPE_SEPARATE + } else if has_data { + CLIDR_CTYPE_DATA + } else if has_inst { + CLIDR_CTYPE_INSTRUCTION + } else { + CLIDR_CTYPE_NO_CACHE + } +} + +/// Build a CLIDR_EL1 value from the host's cache topology read from sysfs. +/// +/// Since host kernel 6.3 (commit 7af0c2534f4c), KVM fabricates CLIDR_EL1 to +/// expose a different cache topology than the host. Guest kernels >= 6.1.156 +/// backported `init_of_cache_level()` which counts cache leaves from the DT, +/// while `populate_cache_leaves()` uses CLIDR_EL1.
If the DT (built from +/// sysfs) describes different cache entries than CLIDR_EL1, the mismatch +/// causes cache sysfs entries to not be created in the guest. +/// +/// This function builds a CLIDR_EL1 value that matches the host's real cache +/// topology so it can be written to each vCPU, making CLIDR_EL1 consistent +/// with the FDT. +pub(crate) fn build_clidr_from_caches( + l1_caches: &[CacheEntry], + non_l1_caches: &[CacheEntry], +) -> u64 { + let mut clidr: u64 = 0; + let mut max_level: u8 = 0; + + let l1_ctype = ctype_for_entries(l1_caches.iter()); + if l1_ctype != CLIDR_CTYPE_NO_CACHE { + clidr |= l1_ctype; + max_level = 1; + } + + for level in 2..=MAX_CACHE_LEVEL { + let ctype = ctype_for_entries(non_l1_caches.iter().filter(|c| c.level == level)); + if ctype == CLIDR_CTYPE_NO_CACHE { + break; + } + + let shift = CLIDR_CTYPE_SHIFT * (level - 1); + clidr |= ctype << shift; + max_level = level; + } + + // Set LoC (Level of Coherence) to the highest cache level + clidr |= u64::from(max_level) << CLIDR_LOC_SHIFT; + + clidr +} + +/// Merge sysfs-derived ctype/LoC fields into an existing CLIDR_EL1 value, +/// preserving LoUU, LoUIS, ICB, and Ttype fields from the original. +/// +/// This ensures that on pre-6.3 kernels (where CLIDR already matches sysfs), +/// the write is effectively a no-op, and fields we can't derive from sysfs +/// (like LoUU, LoUIS, ICB) are never clobbered. +pub(crate) fn merge_clidr(current: u64, sysfs: u64) -> u64 { + // Ctype fields: bits [20:0] (7 levels × 3 bits each = 21 bits) + // LoC field: bits [26:24] + // We replace only these fields from sysfs, preserving LoUIS [23:21], + // LoUU [29:27], ICB [32:30], and Ttype [46:33] from the original. 
+ const CTYPE_MASK: u64 = 0x001F_FFFF; // bits [20:0] + const LOC_MASK: u64 = 0x0700_0000; // bits [26:24] + const REPLACE_MASK: u64 = CTYPE_MASK | LOC_MASK; + (current & !REPLACE_MASK) | (sysfs & REPLACE_MASK) +} + #[cfg(test)] mod tests { use std::collections::HashMap; @@ -576,4 +675,101 @@ mod tests { assert_eq!(l1_caches.len(), 2); assert_eq!(l1_caches.len(), 2); } + + #[test] + fn test_build_clidr_from_caches() { + // L1 Separate (Data + Instruction) + L2 Unified + L3 Unified + let l1 = vec![ + CacheEntry { + level: 1, + type_: CacheType::Data, + ..CacheEntry::default() + }, + CacheEntry { + level: 1, + type_: CacheType::Instruction, + ..CacheEntry::default() + }, + ]; + let non_l1 = vec![ + CacheEntry { + level: 2, + type_: CacheType::Unified, + ..CacheEntry::default() + }, + CacheEntry { + level: 3, + type_: CacheType::Unified, + ..CacheEntry::default() + }, + ]; + let clidr = build_clidr_from_caches(&l1, &non_l1); + // ctype1=3 (Separate), ctype2=4 (Unified), ctype3=4 (Unified), LoC=3 + assert_eq!(clidr & 0x7, 3, "L1 should be Separate"); + assert_eq!((clidr >> 3) & 0x7, 4, "L2 should be Unified"); + assert_eq!((clidr >> 6) & 0x7, 4, "L3 should be Unified"); + assert_eq!((clidr >> 24) & 0x7, 3, "LoC should be 3"); + + // L1 Unified only (no higher levels) + let l1_unified = vec![CacheEntry { + level: 1, + type_: CacheType::Unified, + ..CacheEntry::default() + }]; + let clidr = build_clidr_from_caches(&l1_unified, &[]); + assert_eq!(clidr & 0x7, 4, "L1 should be Unified"); + assert_eq!((clidr >> 3) & 0x7, 0, "L2 should be NoCache"); + assert_eq!((clidr >> 24) & 0x7, 1, "LoC should be 1"); + + // No caches at all + let clidr = build_clidr_from_caches(&[], &[]); + assert_eq!(clidr, 0, "Empty caches should produce CLIDR=0"); + + // Mock store default: L1 Data + L1 Instruction + L2 Unified + let mut l1_mock: Vec<CacheEntry> = Vec::new(); + let mut non_l1_mock: Vec<CacheEntry> = Vec::new(); + read_cache_config(&mut l1_mock, &mut non_l1_mock).unwrap(); + let clidr =
build_clidr_from_caches(&l1_mock, &non_l1_mock); + assert_eq!(clidr & 0x7, 3, "Mock L1 should be Separate"); + assert_eq!((clidr >> 3) & 0x7, 4, "Mock L2 should be Unified"); + assert_eq!((clidr >> 24) & 0x7, 2, "Mock LoC should be 2"); + } + + #[test] + fn test_merge_clidr() { + // CLIDR_EL1 layout: + // [20:0] Ctype1..Ctype7 (7 × 3 bits) + // [23:21] LoUIS + // [26:24] LoC + // [29:27] LoUU + // [32:30] ICB + // [46:33] Ttype1..Ttype7 + // + // merge_clidr replaces only Ctype [20:0] and LoC [26:24] from sysfs, + // preserving LoUIS, LoUU, ICB, and Ttype from current. + + // current: LoUU=2 [29:27], LoUIS=1 [23:21], ICB=1 [32:30] + // Ctype1=Unified(4) [2:0], LoC=1 [26:24] + let current: u64 = (1 << 30) // ICB=1 + | (2 << 27) // LoUU=2 + | (1 << 24) // LoC=1 + | (1 << 21) // LoUIS=1 + | 4; // Ctype1=Unified + // sysfs: Ctype1=Separate(3), Ctype2=Unified(4), Ctype3=Unified(4), LoC=3 + let sysfs: u64 = (3 << 24) | (4 << 6) | (4 << 3) | 3; + let merged = merge_clidr(current, sysfs); + + // Ctype and LoC should come from sysfs + assert_eq!(merged & 0x001F_FFFF, sysfs & 0x001F_FFFF, "Ctype mismatch"); + assert_eq!((merged >> 24) & 0x7, 3, "LoC should be 3 from sysfs"); + // LoUIS, LoUU, ICB should be preserved from current + assert_eq!((merged >> 21) & 0x7, 1, "LoUIS should be preserved"); + assert_eq!((merged >> 27) & 0x7, 2, "LoUU should be preserved"); + assert_eq!((merged >> 30) & 0x7, 1, "ICB should be preserved"); + + // When current == sysfs in the replaced region, merge is identity + let current = 0x0000_0000_0300_0123_u64; + let sysfs = 0x0000_0000_0300_0123_u64; + assert_eq!(merge_clidr(current, sysfs), current); + } } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs index ee4ecafba1e..8f898a09301 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs @@ -80,7 +80,7 @@ pub fn its_restore_tables(its_fd: &DeviceFd) -> 
Result<(), GicError> { } /// ITS registers that we save/restore during snapshot -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct ItsRegisterState { iidr: u64, cbaser: u64, diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs index 3df0d4642d7..914fdf45d76 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs @@ -57,9 +57,13 @@ pub fn restore_state( icc_regs::set_icc_regs(gic_device, *mpidr, &vcpu_state.icc)?; } - // Safe to unwrap here, as we know we support an ITS device, so `its_state.is_some()` is always - // `true`. - state.its_state.as_ref().unwrap().restore(its_device) + // `its_state` is `None` when loading a snapshot created by an older Firecracker version that + // did not save ITS state. In that case, skip ITS restore and leave the ITS in its reset + // state; the guest kernel will re-initialize it. 
+ if let Some(its_state) = &state.its_state { + its_state.restore(its_device)?; + } + Ok(()) } #[cfg(test)] diff --git a/src/vmm/src/arch/aarch64/gic/mod.rs b/src/vmm/src/arch/aarch64/gic/mod.rs index 9bfabee1fea..0fe0aa899b3 100644 --- a/src/vmm/src/arch/aarch64/gic/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/mod.rs @@ -8,7 +8,8 @@ mod regs; use gicv2::GICv2; use gicv3::GICv3; use kvm_ioctls::{DeviceFd, VmFd}; -pub use regs::GicState; +pub use regs::{GicRegState, GicState, GicVcpuState, VgicSysRegsState}; +pub use gicv3::regs::its_regs::ItsRegisterState; use super::layout; diff --git a/src/vmm/src/arch/aarch64/gic/regs.rs b/src/vmm/src/arch/aarch64/gic/regs.rs index 1afa7acde9c..d05b4568904 100644 --- a/src/vmm/src/arch/aarch64/gic/regs.rs +++ b/src/vmm/src/arch/aarch64/gic/regs.rs @@ -12,20 +12,23 @@ use serde::{Deserialize, Serialize}; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::gicv3::regs::its_regs::ItsRegisterState; -#[derive(Debug, Serialize, Deserialize)] +/// Serializable state for a block of GIC registers. +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct GicRegState { pub(crate) chunks: Vec, } /// Structure for serializing the state of the Vgic ICC regs -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct VgicSysRegsState { + /// Main ICC system registers. pub main_icc_regs: Vec>, + /// AP ICC system registers (one entry per priority group). pub ap_icc_regs: Vec>>, } /// Structure used for serializing the state of the GIC registers. -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct GicState { /// The state of the distributor registers. pub dist: Vec>, @@ -36,9 +39,11 @@ pub struct GicState { } /// Structure used for serializing the state of the GIC registers for a specific vCPU. 
-#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct GicVcpuState { + /// Redistributor registers for this vCPU. pub rdist: Vec>, + /// ICC (CPU interface) system registers for this vCPU. pub icc: VgicSysRegsState, } diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 4e82a7d3d56..e300499799c 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -23,11 +23,13 @@ use std::fs::File; use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; use vm_memory::{GuestMemoryError, GuestMemoryRegion}; +use zerocopy::IntoBytes; use crate::arch::{BootProtocol, EntryPoint, arch_memory_regions_with_gap}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; use crate::cpu_config::templates::CustomCpuTemplate; use crate::initrd::InitrdConfig; +use crate::logger::warn; use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{ @@ -51,6 +53,8 @@ pub enum ConfigurationError { VcpuConfig(#[from] CpuConfigurationError), /// Error configuring the vcpu: {0} VcpuConfigure(#[from] KvmVcpuError), + /// Failed to read host cache information: {0} + CacheInfo(#[from] cache_info::CacheInfoError), } /// Returns a Vec of the valid memory addresses for aarch64. @@ -118,6 +122,11 @@ pub fn configure_system_for_boot( &optional_capabilities, )?; } + + // Override CLIDR_EL1 ctype/LoC fields on each vCPU to match the host's + // real cache topology. See `override_clidr` for details. + override_clidr(vcpus)?; + let vcpu_mpidr = vcpus .iter_mut() .map(|cpu| cpu.kvm_vcpu.get_mpidr()) @@ -142,6 +151,70 @@ pub fn configure_system_for_boot( Ok(()) } +/// Override CLIDR_EL1 ctype/LoC fields on each vCPU to match the host's real +/// cache topology. 
+/// +/// Since host kernel 6.3 (commit 7af0c2534f4c), KVM fabricates CLIDR_EL1 +/// instead of passing through the host's real value. This can cause the guest +/// to see fewer cache levels than actually exist. Guest kernels >= 6.1.156 +/// backported `init_of_cache_level()` which counts cache leaves from the DT, +/// while `populate_cache_leaves()` uses CLIDR_EL1. If the DT (built from host +/// sysfs) describes different cache entries than CLIDR_EL1, the mismatch +/// causes cache sysfs entries to not be created. +/// +/// We read the current (possibly fabricated) CLIDR_EL1, replace only the ctype +/// and LoC fields with values derived from sysfs, and preserve all other fields +/// (LoUU, LoUIS, ICB, Ttype). This is safe on pre-6.3 kernels where CLIDR +/// already matches sysfs — the write is skipped as a no-op. +fn override_clidr(vcpus: &[Vcpu]) -> Result<(), ConfigurationError> { + let mut l1_caches = Vec::new(); + let mut non_l1_caches = Vec::new(); + cache_info::read_cache_config(&mut l1_caches, &mut non_l1_caches)?; + + // If sysfs reports no L1 caches, we cannot build a meaningful CLIDR. + // Writing an all-zero CLIDR would tell the guest there are no caches, + // which is worse than whatever KVM fabricated. Leave it alone. + if l1_caches.is_empty() { + warn!("No L1 caches found in sysfs, skipping CLIDR override"); + return Ok(()); + } + + let sysfs_clidr = cache_info::build_clidr_from_caches(&l1_caches, &non_l1_caches); + + let mut cur_clidr: u64 = 0; + // Reading/writing CLIDR_EL1 via KVM_SET_ONE_REG may not be supported on + // older kernels (pre-6.3). In that case KVM passes through the real host + // CLIDR and the override is unnecessary, so we warn and continue. 
+ if let Err(e) = vcpus[0] + .kvm_vcpu + .fd + .get_one_reg(regs::CLIDR_EL1, cur_clidr.as_mut_bytes()) + { + warn!("Failed to read CLIDR_EL1, skipping override: {e}"); + return Ok(()); + } + + let new_clidr = cache_info::merge_clidr(cur_clidr, sysfs_clidr); + + if new_clidr != cur_clidr { + for vcpu in vcpus.iter() { + if let Err(e) = vcpu + .kvm_vcpu + .fd + .set_one_reg(regs::CLIDR_EL1, new_clidr.as_bytes()) + { + warn!( + "Failed to set CLIDR_EL1 to {:#x} on vCPU {}, skipping override: {e}", + new_clidr, vcpu.kvm_vcpu.index + ); + return Ok(()); + } + } + } + + Ok(()) +} + /// Returns the memory address where the kernel could be loaded. pub fn get_kernel_start() -> u64 { layout::SYSTEM_MEM_START + layout::SYSTEM_MEM_SIZE diff --git a/src/vmm/src/arch/aarch64/regs.rs b/src/vmm/src/arch/aarch64/regs.rs index 7a24337e5c0..abcde5914b8 100644 --- a/src/vmm/src/arch/aarch64/regs.rs +++ b/src/vmm/src/arch/aarch64/regs.rs @@ -90,6 +90,10 @@ arm64_sys_reg!(ID_AA64ISAR0_EL1, 3, 0, 0, 6, 0); arm64_sys_reg!(ID_AA64ISAR1_EL1, 3, 0, 0, 6, 1); arm64_sys_reg!(ID_AA64MMFR2_EL1, 3, 0, 0, 7, 2); +// Cache Level ID Register +// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/CLIDR-EL1--Cache-Level-ID-Register +arm64_sys_reg!(CLIDR_EL1, 3, 1, 0, 0, 1); + // Counter-timer Virtual Timer CompareValue register. 
// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/CNTV-CVAL-EL0--Counter-timer-Virtual-Timer-CompareValue-register // https://elixir.bootlin.com/linux/v6.8/source/arch/arm64/include/asm/sysreg.h#L468 diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index 37d97d8c212..92a48252e78 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -5,7 +5,7 @@ use std::fmt; use std::sync::{Arc, Mutex}; use kvm_bindings::{ - KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, + KVM_CLOCK_REALTIME, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_PIT_SPEAKER_DUMMY, MsrList, kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, }; use kvm_ioctls::Cap; @@ -30,6 +30,8 @@ pub enum ArchVmError { SetPit2(kvm_ioctls::Error), /// Set clock error: {0} SetClock(kvm_ioctls::Error), + /// clock_realtime requested but not present in the snapshot state + ClockRealtimeNotInState, /// Set IrqChipPicMaster error: {0} SetIrqChipPicMaster(kvm_ioctls::Error), /// Set IrqChipPicSlave error: {0} @@ -127,13 +129,25 @@ impl ArchVm { /// - [`kvm_ioctls::VmFd::set_irqchip`] errors. /// - [`kvm_ioctls::VmFd::set_irqchip`] errors. /// - [`kvm_ioctls::VmFd::set_irqchip`] errors. 
- pub fn restore_state(&mut self, state: &VmState) -> Result<(), ArchVmError> { + pub fn restore_state( + &mut self, + state: &VmState, + clock_realtime: bool, + ) -> Result<(), ArchVmError> { self.fd() .set_pit2(&state.pitstate) .map_err(ArchVmError::SetPit2)?; - self.fd() - .set_clock(&state.clock) - .map_err(ArchVmError::SetClock)?; + let mut clock = state.clock; + clock.flags = if clock_realtime { + // clock_realtime needs to be present in the snapshot + if clock.flags & KVM_CLOCK_REALTIME == 0 { + return Err(ArchVmError::ClockRealtimeNotInState); + } + KVM_CLOCK_REALTIME + } else { + 0 + }; + self.fd().set_clock(&clock).map_err(ArchVmError::SetClock)?; self.fd() .set_irqchip(&state.pic_master) .map_err(ArchVmError::SetIrqChipPicMaster)?; @@ -167,9 +181,7 @@ impl ArchVm { pub fn save_state(&self) -> Result { let pitstate = self.fd().get_pit2().map_err(ArchVmError::VmGetPit2)?; - let mut clock = self.fd().get_clock().map_err(ArchVmError::VmGetClock)?; - // This bit is not accepted in SET_CLOCK, clear it. - clock.flags &= !KVM_CLOCK_TSC_STABLE; + let clock = self.fd().get_clock().map_err(ArchVmError::VmGetClock)?; let mut pic_master = kvm_irqchip { chip_id: KVM_IRQCHIP_PIC_MASTER, @@ -224,13 +236,18 @@ pub struct VmState { pub memory: GuestMemoryState, /// resource allocator pub resource_allocator: ResourceAllocator, - pitstate: kvm_pit_state2, - clock: kvm_clock_data, + /// KVM interrupt timer + pub pitstate: kvm_pit_state2, + /// KVM clock data + pub clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. - pic_master: kvm_irqchip, + /// Master PIC controller + pub pic_master: kvm_irqchip, // TODO: rename this field to adopt inclusive language once Linux updates it, too. 
- pic_slave: kvm_irqchip, - ioapic: kvm_irqchip, + /// Slave PIC controller + pub pic_slave: kvm_irqchip, + /// IOAPIC + pub ioapic: kvm_irqchip, } impl fmt::Debug for VmState { @@ -247,11 +264,15 @@ impl fmt::Debug for VmState { #[cfg(test)] mod tests { + use std::time::SystemTime; + use kvm_bindings::{ - KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, + KVM_CLOCK_REALTIME, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_PIT_SPEAKER_DUMMY, }; + use kvm_ioctls::Cap; + use crate::arch::ArchVmError; use crate::snapshot::Snapshot; use crate::vstate::vm::VmState; use crate::vstate::vm::tests::{setup_vm, setup_vm_with_memory}; @@ -271,7 +292,6 @@ mod tests { vm_state.pitstate.flags | KVM_PIT_SPEAKER_DUMMY, KVM_PIT_SPEAKER_DUMMY ); - assert_eq!(vm_state.clock.flags & KVM_CLOCK_TSC_STABLE, 0); assert_eq!(vm_state.pic_master.chip_id, KVM_IRQCHIP_PIC_MASTER); assert_eq!(vm_state.pic_slave.chip_id, KVM_IRQCHIP_PIC_SLAVE); assert_eq!(vm_state.ioapic.chip_id, KVM_IRQCHIP_IOAPIC); @@ -279,7 +299,46 @@ mod tests { let (_, mut vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); - vm.restore_state(&vm_state).unwrap(); + vm.restore_state(&vm_state, false).unwrap(); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_vm_save_restore_state_kvm_clock_realtime() { + let (kvm, vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + let clock_realtime_supported = + kvm.fd.check_extension_int(Cap::AdjustClock).cast_unsigned() & KVM_CLOCK_REALTIME != 0; + + // mock a state without realtime information + let mut vm_state = vm.save_state().unwrap(); + vm_state.clock.flags &= !KVM_CLOCK_REALTIME; + + let (_, mut vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + let res = vm.restore_state(&vm_state, true); + assert!(res == Err(ArchVmError::ClockRealtimeNotInState)); + + // mock a state with realtime information + vm_state.clock.flags |= KVM_CLOCK_REALTIME; + 
vm_state.clock.realtime = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_nanos() + .try_into() + .unwrap(); + + let (_, mut vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + let res = vm.restore_state(&vm_state, true); + if clock_realtime_supported { + res.unwrap() + } else { + assert!(matches!(res, Err(ArchVmError::SetClock(err)) if err.errno() == libc::EINVAL)) + } } #[cfg(target_arch = "x86_64")] @@ -297,18 +356,18 @@ mod tests { // Try to restore an invalid PIC Master chip ID let orig_master_chip_id = vm_state.pic_master.chip_id; vm_state.pic_master.chip_id = KVM_NR_IRQCHIPS; - vm.restore_state(&vm_state).unwrap_err(); + vm.restore_state(&vm_state, false).unwrap_err(); vm_state.pic_master.chip_id = orig_master_chip_id; // Try to restore an invalid PIC Slave chip ID let orig_slave_chip_id = vm_state.pic_slave.chip_id; vm_state.pic_slave.chip_id = KVM_NR_IRQCHIPS; - vm.restore_state(&vm_state).unwrap_err(); + vm.restore_state(&vm_state, false).unwrap_err(); vm_state.pic_slave.chip_id = orig_slave_chip_id; // Try to restore an invalid IOPIC chip ID vm_state.ioapic.chip_id = KVM_NR_IRQCHIPS; - vm.restore_state(&vm_state).unwrap_err(); + vm.restore_state(&vm_state, false).unwrap_err(); } #[cfg(target_arch = "x86_64")] @@ -326,6 +385,6 @@ mod tests { .unwrap() .data; - vm.restore_state(&restored_state).unwrap(); + vm.restore_state(&restored_state, false).unwrap(); } } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 332b1ac3cc3..15be948861d 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -318,6 +318,7 @@ pub fn build_microvm_for_boot( vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager, + page_size: vm_resources.machine_config.huge_pages.page_size(), }; let vmm = Arc::new(Mutex::new(vmm)); @@ -423,6 +424,8 @@ pub enum BuildMicrovmFromSnapshotError { SeccompFiltersInternal(#[from] crate::seccomp::InstallationError), /// Failed to restore devices: {0} RestoreDevices(#[from] 
DevicePersistError), + /// clock_realtime is not supported on aarch64. + UnsupportedClockRealtime, } /// Builds and starts a microVM based on the provided MicrovmState. @@ -438,6 +441,7 @@ pub fn build_microvm_from_snapshot( uffd: Option, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, + clock_realtime: bool, ) -> Result>, BuildMicrovmFromSnapshotError> { // Build Vmm. debug!("event_start: build microvm from snapshot"); @@ -479,6 +483,9 @@ pub fn build_microvm_from_snapshot( #[cfg(target_arch = "aarch64")] { + if clock_realtime { + return Err(BuildMicrovmFromSnapshotError::UnsupportedClockRealtime); + } let mpidrs = construct_kvm_mpidrs(&microvm_state.vcpu_states); // Restore kvm vm state. vm.restore_state(&mpidrs, &microvm_state.vm_state)?; @@ -486,7 +493,7 @@ pub fn build_microvm_from_snapshot( // Restore kvm vm state. #[cfg(target_arch = "x86_64")] - vm.restore_state(&microvm_state.vm_state)?; + vm.restore_state(&microvm_state.vm_state, clock_realtime)?; // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; @@ -518,6 +525,7 @@ pub fn build_microvm_from_snapshot( vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager, + page_size: vm_resources.machine_config.huge_pages.page_size(), }; // Move vcpus to their own threads and start their state machine in the 'Paused' state.
@@ -751,6 +759,7 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::arch::host_page_size; use crate::device_manager::tests::default_device_manager; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::generated::virtio_ids; @@ -836,6 +845,7 @@ pub(crate) mod tests { vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager: default_device_manager(), + page_size: host_page_size(), } } diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index fc245e05539..2a556b342b1 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -35,7 +35,7 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::resources::VmResources; use crate::snapshot::Persist; -use crate::utils::open_file_write_nonblock; +use crate::utils::open_file_nonblock; use crate::vstate::bus::BusError; use crate::vstate::memory::GuestMemoryMmap; use crate::{EmulateSerialInitError, EventManager, Vm}; @@ -125,7 +125,7 @@ impl DeviceManager { output: Option<&PathBuf>, ) -> Result>, std::io::Error> { let (serial_in, serial_out) = match output { - Some(path) => (None, open_file_write_nonblock(path).map(SerialOut::File)?), + Some(path) => (None, open_file_nonblock(path).map(SerialOut::File)?), None => { Self::set_stdout_nonblocking(); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 2a0393e57f2..8e815bf8b6c 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -168,9 +168,9 @@ impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct ACPIDeviceManagerState { - vmgenid: VMGenIDState, + pub vmgenid: VMGenIDState, #[cfg(target_arch = "x86_64")] - vmclock: VmClockState, + pub vmclock: VmClockState, } impl<'a> Persist<'a> for ACPIDeviceManager { diff --git 
a/src/vmm/src/devices/acpi/mod.rs b/src/vmm/src/devices/acpi/mod.rs index 8eba26ac41d..4e8c62922e6 100644 --- a/src/vmm/src/devices/acpi/mod.rs +++ b/src/vmm/src/devices/acpi/mod.rs @@ -1,6 +1,6 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -mod generated; +pub mod generated; pub mod vmclock; pub mod vmgenid; diff --git a/src/vmm/src/devices/acpi/vmclock.rs b/src/vmm/src/devices/acpi/vmclock.rs index d7882a78ded..56aee6e44d4 100644 --- a/src/vmm/src/devices/acpi/vmclock.rs +++ b/src/vmm/src/devices/acpi/vmclock.rs @@ -22,7 +22,7 @@ use crate::vstate::resources::ResourceAllocator; unsafe impl ByteValued for vmclock_abi {} // We are reserving a physical page to expose the [`VmClock`] data -const VMCLOCK_SIZE: u32 = 0x1000; +pub const VMCLOCK_SIZE: u32 = 0x1000; // Write a value in `vmclock_abi` both in the Firecracker-managed state // and inside guest memory address that corresponds to it. diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 4d83075fa0f..411b84bc7be 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -42,6 +42,16 @@ use crate::{impl_device_type, mem_size_mib}; const SIZE_OF_U32: usize = std::mem::size_of::(); const SIZE_OF_STAT: usize = std::mem::size_of::(); +/// Upper bound on the number of stats tags a guest may report. +/// The VirtIO spec currently defines 16, but newer kernel versions can +/// add more (e.g. Linux 6.12 added several, see 74c025c5d7e4). We use a +/// generous limit that still bounds computation without breaking on future +/// kernels. +const MAX_STATS_TAGS: u32 = 256; +/// Maximum valid stats descriptor length in bytes. +/// Descriptors exceeding this are rejected to prevent unbounded iteration. 
+#[allow(clippy::cast_possible_truncation)] +const MAX_STATS_DESC_LEN: u32 = MAX_STATS_TAGS * std::mem::size_of::() as u32; fn mib_to_pages(amount_mib: u32) -> Result { amount_mib @@ -68,7 +78,7 @@ unsafe impl ByteValued for ConfigSpace {} /// Holds state of the free page hinting run #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)] -pub(crate) struct HintingState { +pub struct HintingState { /// The command requested by us. Set to STOP by default. pub host_cmd: u32, /// The last command supplied by guest. @@ -491,7 +501,24 @@ impl Balloon { // the protocol, but return it if we find one. error!("balloon: driver is not compliant, more than one stats buffer received"); self.queues[STATS_INDEX].add_used(prev_stats_desc, 0)?; + self.queues[STATS_INDEX].advance_used_ring_idx(); + self.signal_used_queue(STATS_INDEX)?; } + + // Reject oversized descriptors to prevent a guest from causing + // excessive iteration on the VMM event loop. + // We still hold onto the descriptor (via stats_desc_index below) + // so that the stats request/response protocol is preserved and + // trigger_stats_update can return it to the guest later. + if head.len > MAX_STATS_DESC_LEN { + warn!( + "balloon: stats descriptor too large: {} > {}, skipping", + head.len, MAX_STATS_DESC_LEN + ); + self.stats_desc_index = Some(head.index); + continue; + } + for index in (0..head.len).step_by(SIZE_OF_STAT) { // Read the address at position `index`. The only case // in which this fails is if there is overflow, @@ -1952,4 +1979,71 @@ pub(crate) mod tests { assert_eq!(balloon.num_pages(), 0x1122_3344); assert_eq!(balloon.actual_pages(), 0x1234_5678); } + + /// Test that process_stats_queue holds oversized descriptors without + /// updating stats, and updates stats for valid-length ones. 
+ #[test] + fn test_stats_queue_oversized_descriptor_rejected() { + struct TestCase { + desc_len: u32, + stats_updated: bool, + } + + let cases = [ + TestCase { + desc_len: MAX_STATS_DESC_LEN + 1, + stats_updated: false, + }, + TestCase { + desc_len: MAX_STATS_DESC_LEN, + stats_updated: true, + }, + ]; + + let stat_addr: u64 = 0x1000; + + for tc in &cases { + let mut balloon = Balloon::new(0, true, 1, false, false).unwrap(); + let mem = default_mem(); + let statsq = VirtQueue::new(GuestAddress(0), &mem, 16); + balloon.set_queue(INFLATE_INDEX, statsq.create_queue()); + balloon.set_queue(DEFLATE_INDEX, statsq.create_queue()); + balloon.set_queue(STATS_INDEX, statsq.create_queue()); + balloon.activate(mem.clone(), default_interrupt()).unwrap(); + + // Fill the descriptor region with a recognisable stat value. + let n_stats = tc.desc_len as usize / SIZE_OF_STAT; + for i in 0..n_stats { + mem.write_obj::( + BalloonStat { + tag: VIRTIO_BALLOON_S_MEMFREE, + val: 0xBEEF, + }, + GuestAddress(stat_addr + (i * SIZE_OF_STAT) as u64), + ) + .unwrap(); + } + + set_request(&statsq, 0, stat_addr, tc.desc_len, VIRTQ_DESC_F_NEXT); + balloon.queue_events()[STATS_INDEX].write(1).unwrap(); + balloon.process_stats_queue_event().unwrap(); + + // The descriptor should always be held (stats protocol preserved) + // regardless of whether the stats were updated. + assert!( + balloon.stats_desc_index.is_some(), + "desc_len={}: descriptor should be held", + tc.desc_len, + ); + + // Verify stats were only updated for valid descriptors. 
+ assert_eq!( + balloon.latest_stats.free_memory.is_some(), + tc.stats_updated, + "desc_len={}: expected stats_updated={}", + tc.desc_len, + tc.stats_updated, + ); + } + } } diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 2314a98aa33..f044c99494b 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -31,22 +31,22 @@ pub struct BalloonConfigSpaceState { /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BalloonStatsState { - swap_in: Option, - swap_out: Option, - major_faults: Option, - minor_faults: Option, - free_memory: Option, - total_memory: Option, - available_memory: Option, - disk_caches: Option, - hugetlb_allocations: Option, - hugetlb_failures: Option, - oom_kill: Option, - alloc_stall: Option, - async_scan: Option, - direct_scan: Option, - async_reclaim: Option, - direct_reclaim: Option, + pub swap_in: Option, + pub swap_out: Option, + pub major_faults: Option, + pub minor_faults: Option, + pub free_memory: Option, + pub total_memory: Option, + pub available_memory: Option, + pub disk_caches: Option, + pub hugetlb_allocations: Option, + pub hugetlb_failures: Option, + pub oom_kill: Option, + pub alloc_stall: Option, + pub async_scan: Option, + pub direct_scan: Option, + pub async_reclaim: Option, + pub direct_reclaim: Option, } impl BalloonStatsState { @@ -101,11 +101,11 @@ impl BalloonStatsState { /// at snapshot. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct BalloonState { - stats_polling_interval_s: u16, - stats_desc_index: Option, - latest_stats: BalloonStatsState, - config_space: BalloonConfigSpaceState, - hinting_state: HintingState, + pub stats_polling_interval_s: u16, + pub stats_desc_index: Option, + pub latest_stats: BalloonStatsState, + pub config_space: BalloonConfigSpaceState, + pub hinting_state: HintingState, pub virtio_state: VirtioDeviceState, } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/persist.rs b/src/vmm/src/devices/virtio/block/vhost_user/persist.rs index d507fa9577b..230e6caf47b 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/persist.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/persist.rs @@ -15,14 +15,14 @@ use crate::snapshot::Persist; /// vhost-user block device state. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VhostUserBlockState { - id: String, - partuuid: Option, - cache_type: CacheType, - root_device: bool, - socket_path: String, - vu_acked_protocol_features: u64, - config_space: Vec, - virtio_state: VirtioDeviceState, + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub socket_path: String, + pub vu_acked_protocol_features: u64, + pub config_space: Vec, + pub virtio_state: VirtioDeviceState, } impl Persist<'_> for VhostUserBlock { diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 380fe1de0e8..98f17c258ad 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -53,14 +53,14 @@ impl From for FileEngineType { /// Holds info about the block device. Gets saved in snapshot. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct VirtioBlockState { - id: String, - partuuid: Option, - cache_type: CacheType, - root_device: bool, - disk_path: String, + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub disk_path: String, pub virtio_state: VirtioDeviceState, - rate_limiter_state: RateLimiterState, - file_engine_type: FileEngineTypeState, + pub rate_limiter_state: RateLimiterState, + pub file_engine_type: FileEngineTypeState, } impl Persist<'_> for VirtioBlock { diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index ba56cc39aac..1af7a2cc081 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -30,18 +30,29 @@ pub struct NetConfigSpaceState { guest_mac: Option, } +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct RxBufferState { + // Number of iovecs we have parsed from the guest + parsed_descriptor_chains_nr: u16, + // Number of used descriptors + used_descriptors: u16, + // Number of used bytes + used_bytes: u32, +} + /// Information about the network device that are saved /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct NetState { pub id: String, pub tap_if_name: String, - rx_rate_limiter_state: RateLimiterState, - tx_rate_limiter_state: RateLimiterState, + pub rx_rate_limiter_state: RateLimiterState, + pub tx_rate_limiter_state: RateLimiterState, /// The associated MMDS network stack. pub mmds_ns: Option, - config_space: NetConfigSpaceState, + pub config_space: NetConfigSpaceState, pub virtio_state: VirtioDeviceState, + pub rx_buffers_state: RxBufferState, } /// Auxiliary structure for creating a device when resuming from a snapshot. 
@@ -84,6 +95,7 @@ impl Persist<'_> for Net { guest_mac: self.guest_mac, }, virtio_state: VirtioDeviceState::from_device(self), + rx_buffers_state: RxBufferState::default(), } } @@ -128,6 +140,10 @@ impl Persist<'_> for Net { net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; + if state.virtio_state.activated { + net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr; + } + Ok(net) } } diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 85c4940f305..4306b60961b 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -194,13 +194,13 @@ impl VirtioDeviceState { #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct MmioTransportState { // The register where feature bits are stored. - features_select: u32, + pub features_select: u32, // The register where features page is selected. - acked_features_select: u32, - queue_select: u32, - device_status: u32, - config_generation: u32, - interrupt_status: u32, + pub acked_features_select: u32, + pub queue_select: u32, + pub device_status: u32, + pub config_generation: u32, + pub interrupt_status: u32, } /// Auxiliary structure for initializing the transport when resuming from a snapshot. diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 2f9efd80909..f32171db06a 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -28,6 +28,14 @@ use crate::vstate::memory::GuestMemoryMmap; pub const ENTROPY_DEV_ID: &str = "rng"; +/// Maximum number of bytes `handle_one()` will serve per request. +/// +/// Overlapping descriptors within a single chain can cause `buffer.len()` to +/// exceed the amount of distinct guest memory actually backing the request. 
+/// Capping the per-request allocation to 64 KiB keeps host memory usage +/// bounded regardless of how the descriptor chain is constructed. +const MAX_ENTROPY_BYTES: u32 = 64 * 1024; + #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum EntropyError { /// Error while handling an Event file descriptor: {0} @@ -119,14 +127,19 @@ impl Entropy { return Ok(0); } - let mut rand_bytes = vec![0; self.buffer.len() as usize]; + // Cap the number of bytes we actually generate so that the host-side + // allocation stays bounded even when buffer.len() is inflated by + // overlapping descriptors in the chain. + let len = std::cmp::min(self.buffer.len(), MAX_ENTROPY_BYTES); + + let mut rand_bytes = vec![0; len as usize]; rand::fill(&mut rand_bytes).inspect_err(|_| { METRICS.host_rng_fails.inc(); })?; - // It is ok to unwrap here. We are writing `iovec.len()` bytes at offset 0. + // It is ok to unwrap here. We are writing `len` bytes at offset 0. self.buffer.write_all_volatile_at(&rand_bytes, 0).unwrap(); - Ok(self.buffer.len()) + Ok(len) } fn process_entropy_queue(&mut self) -> Result<(), InvalidAvailIdx> { @@ -611,4 +624,125 @@ mod tests { // The rate limiter event should have processed the pending buffer as well assert_eq!(METRICS.entropy_bytes.count(), entropy_bytes + 128); } + + /// Verify that handle_one() caps the host allocation to MAX_ENTROPY_BYTES + /// when overlapping descriptors inflate buffer.len() beyond the limit. + #[test] + fn test_handle_one_caps_overlapping_descriptors() { + use crate::devices::virtio::queue::VIRTQ_DESC_F_NEXT; + use crate::devices::virtio::test_utils::VirtQueue; + use crate::test_utils::single_region_mem; + use crate::vstate::memory::GuestAddress; + + // 32 descriptors × 4 KiB = 128 KiB claimed, which exceeds MAX_ENTROPY_BYTES (64 KiB). 
+ const N_DESC: u16 = 32; + const CHUNK: u32 = 4096; + + let mem = single_region_mem(0x20000); + let vq = VirtQueue::new(GuestAddress(0), &mem, 256); + let mut queue = vq.create_queue(); + + let target: u64 = 0x10000; + for i in 0..N_DESC { + let flags = VIRTQ_DESC_F_WRITE | if i < N_DESC - 1 { VIRTQ_DESC_F_NEXT } else { 0 }; + vq.dtable[i as usize].set(target, CHUNK, flags, i + 1); + } + vq.avail.ring[0].set(0); + vq.avail.idx.set(1); + + let head = queue.pop().unwrap().unwrap(); + // SAFETY: `mem` is a valid guest memory region and `head` is a descriptor chain + // obtained from the virtqueue backed by that memory. + let buf = unsafe { IoVecBufferMut::<256>::from_descriptor_chain(&mem, head).unwrap() }; + // buffer.len() is inflated well past the cap. + assert_eq!(buf.len(), u32::from(N_DESC) * CHUNK); // 128 KiB + + let mut dev = default_entropy(); + dev.buffer = buf; + let bytes = dev.handle_one().unwrap(); + + assert_eq!( + bytes, + MAX_ENTROPY_BYTES, + "handle_one() must cap at MAX_ENTROPY_BYTES ({MAX_ENTROPY_BYTES}), got {bytes} for \ + inflated buffer.len() = {}", + u32::from(N_DESC) * CHUNK + ); + } + + /// Verify that handle_one() caps a large inflated buffer (~4 GiB from + /// 255 overlapping descriptors) to MAX_ENTROPY_BYTES. 
+ #[test] + fn test_handle_one_caps_large_inflated_buffer() { + use crate::devices::virtio::queue::VIRTQ_DESC_F_NEXT; + use crate::devices::virtio::test_utils::VirtQueue; + use crate::test_utils::single_region_mem; + use crate::vstate::memory::GuestAddress; + + const N_DESC: u16 = 255; + const CHUNK: u32 = 16 * 1024 * 1024; // 16 MiB + const TOTAL: u64 = (N_DESC as u64) * (CHUNK as u64); // ~4 GiB + + let mem = single_region_mem((CHUNK as usize) + 0x100000); + let vq = VirtQueue::new(GuestAddress(0), &mem, 256); + let mut queue = vq.create_queue(); + + let target: u64 = 0x80000; + for i in 0..N_DESC { + let flags = VIRTQ_DESC_F_WRITE | if i < N_DESC - 1 { VIRTQ_DESC_F_NEXT } else { 0 }; + vq.dtable[i as usize].set(target, CHUNK, flags, i + 1); + } + vq.avail.ring[0].set(0); + vq.avail.idx.set(1); + + let head = queue.pop().unwrap().unwrap(); + // SAFETY: `mem` is a valid guest memory region and `head` is a descriptor chain + // obtained from the virtqueue backed by that memory. + let buf = unsafe { IoVecBufferMut::<256>::from_descriptor_chain(&mem, head).unwrap() }; + assert_eq!(buf.len() as u64, TOTAL); + + let mut dev = default_entropy(); + dev.buffer = buf; + let bytes = dev.handle_one().unwrap(); + + assert_eq!( + bytes, MAX_ENTROPY_BYTES, + "handle_one() must cap at MAX_ENTROPY_BYTES, not allocate {} bytes", + TOTAL + ); + } + + /// Verify that a request within MAX_ENTROPY_BYTES is served in full + /// (the cap does not truncate legitimate small requests). 
+ #[test] + fn test_handle_one_serves_small_request_in_full() { + use crate::devices::virtio::test_utils::VirtQueue; + use crate::test_utils::single_region_mem; + use crate::vstate::memory::GuestAddress; + + const SIZE: u32 = 256; + + let mem = single_region_mem(0x20000); + let vq = VirtQueue::new(GuestAddress(0), &mem, 256); + let mut queue = vq.create_queue(); + + vq.dtable[0].set(0x10000, SIZE, VIRTQ_DESC_F_WRITE, 0); + vq.avail.ring[0].set(0); + vq.avail.idx.set(1); + + let head = queue.pop().unwrap().unwrap(); + // SAFETY: `mem` is a valid guest memory region and `head` is a descriptor chain + // obtained from the virtqueue backed by that memory. + let buf = unsafe { IoVecBufferMut::<256>::from_descriptor_chain(&mem, head).unwrap() }; + assert_eq!(buf.len(), SIZE); + + let mut dev = default_entropy(); + dev.buffer = buf; + let bytes = dev.handle_one().unwrap(); + + assert_eq!( + bytes, SIZE, + "small request ({SIZE} bytes) should be served in full, got {bytes}" + ); + } } diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 27df145eb81..e841af4926b 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -20,7 +20,7 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EntropyState { pub virtio_state: VirtioDeviceState, - rate_limiter_state: RateLimiterState, + pub rate_limiter_state: RateLimiterState, } #[derive(Debug)] diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index d98dd4ce365..8dfe17eb5b5 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -168,23 +168,51 @@ impl MmioTransport { #[allow(unused_assignments)] fn set_device_status(&mut self, status: u32) { use device_status::*; - // match changed bits - match !self.device_status & status { - ACKNOWLEDGE if self.device_status == INIT => { 
- self.device_status = status; - } - DRIVER if self.device_status == ACKNOWLEDGE => { - self.device_status = status; + + const VALID_TRANSITIONS: &[(u32, u32)] = &[ + (INIT, ACKNOWLEDGE), + (ACKNOWLEDGE, ACKNOWLEDGE | DRIVER), + (ACKNOWLEDGE | DRIVER, ACKNOWLEDGE | DRIVER | FEATURES_OK), + ( + ACKNOWLEDGE | DRIVER | FEATURES_OK, + ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK, + ), + ]; + + if (status & FAILED) != 0 { + // TODO: notify backend driver to stop the device + self.device_status |= FAILED; + } else if status == INIT { + { + let mut locked_device = self.device.lock().expect("Poisoned lock"); + if locked_device.is_activated() { + let mut device_status = self.device_status; + let reset_result = locked_device.reset(); + match reset_result { + Some((_interrupt_evt, mut _queue_evts)) => {} + None => { + device_status |= FAILED; + } + } + self.device_status = device_status; + } } - FEATURES_OK if self.device_status == (ACKNOWLEDGE | DRIVER) => { - self.device_status = status; + + // If the backend device driver doesn't support reset, + // just leave the device marked as FAILED. + if self.device_status & FAILED == 0 { + self.reset(); } - DRIVER_OK if self.device_status == (ACKNOWLEDGE | DRIVER | FEATURES_OK) => { - self.device_status = status; + } else if VALID_TRANSITIONS + .iter() + .any(|&(from, to)| self.device_status == from && status == to) + { + self.device_status = status; + + // Activate the device when transitioning to DRIVER_OK. 
+ if status == (ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK) { let mut locked_device = self.device.lock().expect("Poisoned lock"); - let device_activated = locked_device.is_activated(); - if !device_activated { - // temporary variable needed for borrow checker + if !locked_device.is_activated() { let activate_result = locked_device.activate(self.mem.clone(), self.interrupt.clone()); if let Err(err) = activate_result { @@ -198,38 +226,11 @@ impl MmioTransport { } } } - _ if (status & FAILED) != 0 => { - // TODO: notify backend driver to stop the device - self.device_status |= FAILED; - } - _ if status == 0 => { - { - let mut locked_device = self.device.lock().expect("Poisoned lock"); - if locked_device.is_activated() { - let mut device_status = self.device_status; - let reset_result = locked_device.reset(); - match reset_result { - Some((_interrupt_evt, mut _queue_evts)) => {} - None => { - device_status |= FAILED; - } - } - self.device_status = device_status; - } - } - - // If the backend device driver doesn't support reset, - // just leave the device marked as FAILED. - if self.device_status & FAILED == 0 { - self.reset(); - } - } - _ => { - warn!( - "invalid virtio driver status transition: {:#x} -> {:#x}", - self.device_status, status - ); - } + } else { + warn!( + "invalid virtio driver status transition: {:#x} -> {:#x}", + self.device_status, status + ); } } } @@ -957,14 +958,6 @@ pub(crate) mod tests { | device_status::DRIVER_OK ); assert!(d.locked_device().is_activated()); - - // A write which changes the size of a queue after activation; currently only triggers - // a warning path and have no effect on queue state. 
- write_le_u32(&mut buf[..], 0); - d.queue_select = 0; - d.write(0x0, 0x44, &buf[..]); - d.read(0x0, 0x44, &mut buf[..]); - assert_eq!(read_le_u32(&buf[..]), 1); } #[test] @@ -1060,6 +1053,61 @@ pub(crate) mod tests { assert!(d.locked_device().is_activated()); } + fn read_device_status(d: &mut MmioTransport) -> u32 { + let mut buf = [0; 4]; + d.read(0x0, 0x70, &mut buf[..]); + read_le_u32(&buf[..]) + } + + #[test] + fn test_device_status_invalid_transitions() { + let m = single_region_mem(0x1000); + let interrupt: Arc = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); + + let mut assert_rejected = |d: &mut MmioTransport, new: u32, expected: u32| { + set_device_status(d, new); + assert_eq!( + read_device_status(d), + expected, + "transition to {new:#x} should be rejected" + ); + }; + + // Skip ACKNOWLEDGE: INIT -> ACKNOWLEDGE | DRIVER + assert_rejected( + &mut d, + device_status::ACKNOWLEDGE | device_status::DRIVER, + device_status::INIT, + ); + // Arbitrary value from INIT + assert_rejected(&mut d, 0x42, device_status::INIT); + + // Advance to ACKNOWLEDGE | DRIVER | FEATURES_OK + set_device_status(&mut d, device_status::ACKNOWLEDGE); + set_device_status(&mut d, device_status::ACKNOWLEDGE | device_status::DRIVER); + set_device_status( + &mut d, + device_status::ACKNOWLEDGE | device_status::DRIVER | device_status::FEATURES_OK, + ); + let expected = + device_status::ACKNOWLEDGE | device_status::DRIVER | device_status::FEATURES_OK; + + // Go back: FEATURES_OK -> DRIVER + assert_rejected( + &mut d, + device_status::ACKNOWLEDGE | device_status::DRIVER, + expected, + ); + // Valid transition FEATURES_OK -> DRIVER_OK but without cumulative bits + assert_rejected(&mut d, device_status::DRIVER_OK, expected); + } + #[test] fn test_bus_device_reset() { let m = single_region_mem(0x1000); @@ -1119,6 +1167,69 @@ pub(crate) mod tests { assert_eq!(dummy_dev.acked_features(), 24); } + #[test] + 
fn test_queue_config_immutable_after_activation() { + // Verify that writes to queue configuration fields are rejected after the device has been + // activated (DRIVER_OK). These MMIO registers are write-only (reads return 0), so this + // cannot be tested at the integration level via /dev/mem readback. + let mem = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); + let mut dev = MmioTransport::new( + mem, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); + activate_device(&mut dev); + + dev.queue_select = 0; + + // Save the queue state right after activation. + let size_before = dev.locked_device().queues()[0].size; + let ready_before = dev.locked_device().queues()[0].ready; + let desc_before = dev.locked_device().queues()[0].desc_table_address; + let avail_before = dev.locked_device().queues()[0].avail_ring_address; + let used_before = dev.locked_device().queues()[0].used_ring_address; + + // Attempt to poison every queue config register. + let mut buf = [0u8; 4]; + + // QueueNum (0x38) + write_le_u32(&mut buf, 0); + dev.write(0x0, 0x38, &buf); + assert_eq!(dev.locked_device().queues()[0].size, size_before); + + // QueueReady (0x44) + write_le_u32(&mut buf, 0); + dev.write(0x0, 0x44, &buf); + assert_eq!(dev.locked_device().queues()[0].ready, ready_before); + + // QueueDescLow/High (0x80, 0x84) + write_le_u32(&mut buf, 0xDEADBEEF); + dev.write(0x0, 0x80, &buf); + dev.write(0x0, 0x84, &buf); + assert_eq!( + dev.locked_device().queues()[0].desc_table_address, + desc_before + ); + + // QueueAvailLow/High (0x90, 0x94) + dev.write(0x0, 0x90, &buf); + dev.write(0x0, 0x94, &buf); + assert_eq!( + dev.locked_device().queues()[0].avail_ring_address, + avail_before + ); + + // QueueUsedLow/High (0xa0, 0xa4) + dev.write(0x0, 0xa0, &buf); + dev.write(0x0, 0xa4, &buf); + assert_eq!( + dev.locked_device().queues()[0].used_ring_address, + used_before + ); + } + #[test] fn irq_trigger() { let irq_trigger = IrqTrigger::new(); diff --git 
a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index 70876d7aefc..b5ee2a2fed4 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -16,7 +16,9 @@ use vm_memory::GuestAddress; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::pci::common_config_offset::*; use crate::devices::virtio::transport::pci::device::VIRTQ_MSI_NO_VECTOR; +use crate::devices::virtio::transport::pci::device_status::*; use crate::logger::warn; pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; @@ -34,28 +36,6 @@ pub struct VirtioPciCommonConfigState { /// Contains the data for reading and writing the common configuration structure of a virtio PCI /// device. -/// -/// * Registers: -/// -/// ** About the whole device. -/// le32 device_feature_select; // 0x00 // read-write -/// le32 device_feature; // 0x04 // read-only for driver -/// le32 driver_feature_select; // 0x08 // read-write -/// le32 driver_feature; // 0x0C // read-write -/// le16 msix_config; // 0x10 // read-write -/// le16 num_queues; // 0x12 // read-only for driver -/// u8 device_status; // 0x14 // read-write (driver_status) -/// u8 config_generation; // 0x15 // read-only for driver -/// -/// ** About a specific virtqueue. -/// le16 queue_select; // 0x16 // read-write -/// le16 queue_size; // 0x18 // read-write, power of 2, or 0. 
-/// le16 queue_msix_vector; // 0x1A // read-write -/// le16 queue_enable; // 0x1C // read-write (Ready) -/// le16 queue_notify_off; // 0x1E // read-only for driver -/// le64 queue_desc; // 0x20 // read-write -/// le64 queue_avail; // 0x28 // read-write -/// le64 queue_used; // 0x30 // read-write #[derive(Debug)] pub struct VirtioPciCommonConfig { pub driver_status: u8, @@ -115,11 +95,17 @@ impl VirtioPciCommonConfig { } } - pub fn write(&mut self, offset: u64, data: &[u8], device: Arc>) { + pub fn write( + &mut self, + offset: u64, + data: &[u8], + device: Arc>, + device_activated: bool, + ) { assert!(data.len() <= 8); match data.len() { - 1 => self.write_common_config_byte(offset, data[0]), + 1 => self.write_common_config_byte(offset, data[0], device_activated), 2 => self.write_common_config_word( offset, LittleEndian::read_u16(data), @@ -136,8 +122,8 @@ impl VirtioPciCommonConfig { fn read_common_config_byte(&self, offset: u64) -> u8 { // The driver is only allowed to do aligned, properly sized access. 
match offset { - 0x14 => self.driver_status, - 0x15 => self.config_generation, + DEVICE_STATUS => self.driver_status, + CONFIG_GENERATION => self.config_generation, _ => { warn!("pci: invalid virtio config byte read: 0x{:x}", offset); 0 @@ -145,36 +131,102 @@ impl VirtioPciCommonConfig { } } - fn write_common_config_byte(&mut self, offset: u64, value: u8) { + fn write_common_config_byte(&mut self, offset: u64, value: u8, device_activated: bool) { match offset { - 0x14 => self.driver_status = value, + DEVICE_STATUS => self.set_device_status(value, device_activated), _ => { warn!("pci: invalid virtio config byte write: 0x{:x}", offset); } } } + fn set_device_status(&mut self, status: u8, device_activated: bool) { + /// Enforce the device status state machine per the virtio spec: + /// INIT -> ACKNOWLEDGE -> DRIVER -> FEATURES_OK -> DRIVER_OK + /// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 + /// + /// Each step sets exactly one new bit while preserving all previous bits. + const VALID_TRANSITIONS: &[(u8, u8)] = &[ + (INIT, ACKNOWLEDGE), + (ACKNOWLEDGE, ACKNOWLEDGE | DRIVER), + (ACKNOWLEDGE | DRIVER, ACKNOWLEDGE | DRIVER | FEATURES_OK), + ( + ACKNOWLEDGE | DRIVER | FEATURES_OK, + ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK, + ), + ]; + + if (status & FAILED) != 0 { + // Something went wrong in the guest. + // + // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-110001 + // > FAILED (128) + // > Indicates that something went wrong in the guest, and it has given up on the + // > device. + self.driver_status |= FAILED; + } else if status == INIT { + // Reset requested by the driver. + // + // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1430001 + // > The device MUST reset when 0 is written to device_status, and present a 0 in + // > device_status once that is done. 
+ // + // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1440002 + // > After writing 0 to device_status, the driver MUST wait for a read of device_status + // > to return 0 before reinitializing the device. + // + // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-200001 + // > 2.4.1 Device Requirements: Device Reset + // > A device MUST reinitialize device status to 0 after receiving a reset. + // + // Setting INIT (0) here before the actual reset completes in write_bar() may appear + // racy - the driver could read 0 before the device is fully torn down. But concurrent + // access is serialized since VirtioPciDevice is accessed through Arc>. + self.driver_status = INIT; + } else if VALID_TRANSITIONS + .iter() + .any(|&(from, to)| self.driver_status == from && status == to) + { + if !device_activated { + self.driver_status = status; + } else { + // If the device doesn't implement reset(), the device is left activated. + // Re-initialization against a still-live backend device MUST be rejected. + warn!( + "pci: rejecting device status transition {:#x} -> {:#x}: previous reset did \ + not complete successfully and device is still active", + self.driver_status, status + ); + } + } else { + warn!( + "pci: invalid virtio device status transition: {:#x} -> {:#x}", + self.driver_status, status + ); + } + } + fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { match offset { - 0x10 => self.msix_config.load(Ordering::Acquire), - 0x12 => queues.len().try_into().unwrap(), // num_queues - 0x16 => self.queue_select, - 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), + MSIX_CONFIG => self.msix_config.load(Ordering::Acquire), + NUM_QUEUES => queues.len().try_into().unwrap(), + QUEUE_SELECT => self.queue_select, + QUEUE_SIZE => self.with_queue(queues, |q| q.size).unwrap_or(0), // If `queue_select` points to an invalid queue we should return NO_VECTOR. 
// Reading from here // https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1280005: // // > The device MUST return vector mapped to a given event, (NO_VECTOR if unmapped) on // > read of config_msix_vector/queue_msix_vector. - 0x1a => self + QUEUE_MSIX_VECTOR => self .msix_queues .lock() .unwrap() .get(self.queue_select as usize) .copied() .unwrap_or(VIRTQ_MSI_NO_VECTOR), - 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), - 0x1e => self.queue_select, // notify_off + QUEUE_ENABLE => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), + QUEUE_NOTIFY_OFF => self.queue_select, _ => { warn!("pci: invalid virtio register word read: 0x{:x}", offset); 0 @@ -182,9 +234,28 @@ impl VirtioPciCommonConfig { } } + /// Guard queue configuration field writes based on device status. + /// + /// Per the virtio spec, the driver SHALL follow this sequence: + /// INIT -> ACKNOWLEDGE -> DRIVER -> FEATURES_OK -> DRIVER_OK + /// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 + /// + /// Queue configuration must only be done between FEATURES_OK and DRIVER_OK. + fn update_queue_field(&mut self, queues: &mut [Queue], f: F) { + let status = self.driver_status; + if status == (ACKNOWLEDGE | DRIVER | FEATURES_OK) { + self.with_queue_mut(queues, f); + } else { + warn!( + "pci: queue config write not allowed in device status {:#x}", + status + ); + } + } + fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { match offset { - 0x10 => { + MSIX_CONFIG => { // Make sure that the guest doesn't select an invalid vector. We are offering // `num_queues + 1` vectors (plus one for configuration updates). If an invalid // vector has been selected, we just store the `NO_VECTOR` value. 
@@ -198,9 +269,9 @@ impl VirtioPciCommonConfig { .store(VIRTQ_MSI_NO_VECTOR, Ordering::Release); } } - 0x16 => self.queue_select = value, - 0x18 => self.with_queue_mut(queues, |q| q.size = value), - 0x1a => { + QUEUE_SELECT => self.queue_select = value, + QUEUE_SIZE => self.update_queue_field(queues, |q| q.size = value), + QUEUE_MSIX_VECTOR => { let mut msix_queues = self.msix_queues.lock().expect("Poisoned lock"); let nr_vectors = msix_queues.len() + 1; // Make sure that `queue_select` points to a valid queue. If not, we won't do @@ -216,7 +287,7 @@ impl VirtioPciCommonConfig { } } } - 0x1c => self.with_queue_mut(queues, |q| { + QUEUE_ENABLE => self.update_queue_field(queues, |q| { if value != 0 { q.ready = value == 1; } @@ -229,8 +300,8 @@ impl VirtioPciCommonConfig { fn read_common_config_dword(&self, offset: u64, device: Arc>) -> u32 { match offset { - 0x00 => self.device_feature_select, - 0x04 => { + DEVICE_FEATURE_SELECT => self.device_feature_select, + DEVICE_FEATURE => { let locked_device = device.lock().unwrap(); // Only 64 bits of features (2 pages) are defined for now, so limit // device_feature_select to avoid shifting by 64 or more bits. 
@@ -241,43 +312,43 @@ impl VirtioPciCommonConfig { 0 } } - 0x08 => self.driver_feature_select, - 0x20 => { + DRIVER_FEATURE_SELECT => self.driver_feature_select, + QUEUE_DESC_LO => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.desc_table_address.0 & 0xffff_ffff) as u32 }) .unwrap_or_default() } - 0x24 => { + QUEUE_DESC_HI => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.desc_table_address.0 >> 32) as u32 }) .unwrap_or_default() } - 0x28 => { + QUEUE_AVAIL_LO => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.avail_ring_address.0 & 0xffff_ffff) as u32 }) .unwrap_or_default() } - 0x2c => { + QUEUE_AVAIL_HI => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.avail_ring_address.0 >> 32) as u32 }) .unwrap_or_default() } - 0x30 => { + QUEUE_USED_LO => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.used_ring_address.0 & 0xffff_ffff) as u32 }) .unwrap_or_default() } - 0x34 => { + QUEUE_USED_HI => { let locked_device = device.lock().unwrap(); self.with_queue(locked_device.queues(), |q| { (q.used_ring_address.0 >> 32) as u32 @@ -308,25 +379,36 @@ impl VirtioPciCommonConfig { let mut locked_device = device.lock().unwrap(); match offset { - 0x00 => self.device_feature_select = value, - 0x08 => self.driver_feature_select = value, - 0x0c => locked_device.ack_features_by_page(self.driver_feature_select, value), - 0x20 => self.with_queue_mut(locked_device.queues_mut(), |q| { + DEVICE_FEATURE_SELECT => self.device_feature_select = value, + DRIVER_FEATURE_SELECT => self.driver_feature_select = value, + DRIVER_FEATURE => { + // Feature negotiation is only allowed in DRIVER state. 
+ // https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1220001 + if self.driver_status == (ACKNOWLEDGE | DRIVER) { + locked_device.ack_features_by_page(self.driver_feature_select, value); + } else { + warn!( + "pci: feature negotiation not allowed in device state {:#x}", + self.driver_status + ); + } + } + QUEUE_DESC_LO => self.update_queue_field(locked_device.queues_mut(), |q| { lo(&mut q.desc_table_address, value) }), - 0x24 => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_DESC_HI => self.update_queue_field(locked_device.queues_mut(), |q| { hi(&mut q.desc_table_address, value) }), - 0x28 => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_AVAIL_LO => self.update_queue_field(locked_device.queues_mut(), |q| { lo(&mut q.avail_ring_address, value) }), - 0x2c => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_AVAIL_HI => self.update_queue_field(locked_device.queues_mut(), |q| { hi(&mut q.avail_ring_address, value) }), - 0x30 => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_USED_LO => self.update_queue_field(locked_device.queues_mut(), |q| { lo(&mut q.used_ring_address, value) }), - 0x34 => self.with_queue_mut(locked_device.queues_mut(), |q| { + QUEUE_USED_HI => self.update_queue_field(locked_device.queues_mut(), |q| { hi(&mut q.used_ring_address, value) }), _ => { @@ -355,6 +437,7 @@ mod tests { use super::*; use crate::devices::virtio::transport::mmio::tests::DummyDevice; + use crate::devices::virtio::transport::pci::common_config_offset::*; fn default_device() -> Arc> { Arc::new(Mutex::new(DummyDevice::new())) @@ -385,54 +468,49 @@ mod tests { }; let dev = Arc::new(Mutex::new(DummyDevice::new())); - // Can set all bits of driver_status. - regs.write(0x14, &[0x55], dev.clone()); - let mut read_back = vec![0x00]; - regs.read(0x14, &mut read_back, dev.clone()); - assert_eq!(read_back[0], 0x55); // The config generation register is read only. 
- regs.write(0x15, &[0xaa], dev.clone()); + regs.write(CONFIG_GENERATION, &[0xaa], dev.clone(), false); let mut read_back = vec![0x00]; - regs.read(0x15, &mut read_back, dev.clone()); + regs.read(CONFIG_GENERATION, &mut read_back, dev.clone()); assert_eq!(read_back[0], 0x55); // Device features is read-only and passed through from the device. - regs.write(0x04, &[0, 0, 0, 0], dev.clone()); + regs.write(DEVICE_FEATURE, &[1, 2, 3, 4], dev.clone(), false); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x04, &mut read_back, dev.clone()); + regs.read(DEVICE_FEATURE, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0u32); // Feature select registers are read/write. - regs.write(0x00, &[1, 2, 3, 4], dev.clone()); + regs.write(DEVICE_FEATURE_SELECT, &[1, 2, 3, 4], dev.clone(), false); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x00, &mut read_back, dev.clone()); + regs.read(DEVICE_FEATURE_SELECT, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); - regs.write(0x08, &[1, 2, 3, 4], dev.clone()); + regs.write(DRIVER_FEATURE_SELECT, &[1, 2, 3, 4], dev.clone(), false); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x08, &mut read_back, dev.clone()); + regs.read(DRIVER_FEATURE_SELECT, &mut read_back, dev.clone()); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); // 'queue_select' can be read and written. 
- regs.write(0x16, &[0xaa, 0x55], dev.clone()); + regs.write(QUEUE_SELECT, &[0xaa, 0x55], dev.clone(), false); let mut read_back = vec![0x00, 0x00]; - regs.read(0x16, &mut read_back, dev.clone()); + regs.read(QUEUE_SELECT, &mut read_back, dev.clone()); assert_eq!(read_back[0], 0xaa); assert_eq!(read_back[1], 0x55); // Getting the MSI vector when `queue_select` points to an invalid queue should return // NO_VECTOR (0xffff) - regs.read(0x1a, &mut read_back, dev.clone()); + regs.read(QUEUE_MSIX_VECTOR, &mut read_back, dev.clone()); assert_eq!(read_back, [0xff, 0xff]); // Writing the MSI vector of an invalid `queue_select` does not have any effect. - regs.write(0x1a, &[0x12, 0x13], dev.clone()); + regs.write(QUEUE_MSIX_VECTOR, &[0x12, 0x13], dev.clone(), false); assert_eq!(read_back, [0xff, 0xff]); // Valid `queue_select` though should setup the corresponding MSI-X queue. - regs.write(0x16, &[0x1, 0x0], dev.clone()); + regs.write(QUEUE_SELECT, &[0x1, 0x0], dev.clone(), false); assert_eq!(regs.queue_select, 1); - regs.write(0x1a, &[0x1, 0x0], dev.clone()); - regs.read(0x1a, &mut read_back, dev); + regs.write(QUEUE_MSIX_VECTOR, &[0x1, 0x0], dev.clone(), false); + regs.read(QUEUE_MSIX_VECTOR, &mut read_back, dev); assert_eq!(LittleEndian::read_u16(&read_back[..2]), 0x1); } @@ -447,15 +525,25 @@ mod tests { .unwrap() .set_avail_features(0x0000_1312_0000_1110); - config.read(0x04, features.as_mut_slice(), device.clone()); + config.read(DEVICE_FEATURE, features.as_mut_slice(), device.clone()); assert_eq!(features, 0x1110); // select second page - config.write(0x0, 1u32.as_slice(), device.clone()); - config.read(0x04, features.as_mut_slice(), device.clone()); + config.write( + DEVICE_FEATURE_SELECT, + 1u32.as_slice(), + device.clone(), + false, + ); + config.read(DEVICE_FEATURE, features.as_mut_slice(), device.clone()); assert_eq!(features, 0x1312); // Try a third page. 
It doesn't exist so we should get all 0s - config.write(0x0, 2u32.as_slice(), device.clone()); - config.read(0x04, features.as_mut_slice(), device.clone()); + config.write( + DEVICE_FEATURE_SELECT, + 2u32.as_slice(), + device.clone(), + false, + ); + config.read(DEVICE_FEATURE, features.as_mut_slice(), device.clone()); assert_eq!(features, 0x0); } @@ -468,12 +556,45 @@ mod tests { .unwrap() .set_avail_features(0x0000_1312_0000_1110); + // Feature negotiation requires DRIVER state (ACKNOWLEDGE | DRIVER). + config.set_device_status(ACKNOWLEDGE, false); + config.set_device_status(ACKNOWLEDGE | DRIVER, false); + // ACK some features of the first page - config.write(0x0c, 0x1100u32.as_slice(), device.clone()); + config.write(DRIVER_FEATURE, 0x1100u32.as_slice(), device.clone(), false); assert_eq!(device.lock().unwrap().acked_features(), 0x1100); // ACK some features of the second page - config.write(0x08, 1u32.as_slice(), device.clone()); - config.write(0x0c, 0x0000_1310u32.as_slice(), device.clone()); + config.write( + DRIVER_FEATURE_SELECT, + 1u32.as_slice(), + device.clone(), + false, + ); + config.write( + DRIVER_FEATURE, + 0x0000_1310u32.as_slice(), + device.clone(), + false, + ); + assert_eq!( + device.lock().unwrap().acked_features(), + 0x0000_1310_0000_1100 + ); + + // After FEATURES_OK, further feature writes should be rejected. 
+ config.set_device_status(ACKNOWLEDGE | DRIVER | FEATURES_OK, false); + config.write( + DRIVER_FEATURE_SELECT, + 0u32.as_slice(), + device.clone(), + false, + ); + config.write( + DRIVER_FEATURE, + 0xFFFF_FFFFu32.as_slice(), + device.clone(), + false, + ); assert_eq!( device.lock().unwrap().acked_features(), 0x0000_1310_0000_1100 @@ -486,25 +607,143 @@ mod tests { let mut device = default_device(); let mut num_queues = 0u16; - config.read(0x12, num_queues.as_mut_slice(), device.clone()); + config.read(NUM_QUEUES, num_queues.as_mut_slice(), device.clone()); assert_eq!(num_queues, 2); // `num_queues` is read-only - config.write(0x12, 4u16.as_slice(), device.clone()); - config.read(0x12, num_queues.as_mut_slice(), device.clone()); + config.write(NUM_QUEUES, 4u16.as_slice(), device.clone(), false); + config.read(NUM_QUEUES, num_queues.as_mut_slice(), device.clone()); assert_eq!(num_queues, 2); } #[test] fn test_device_status() { let mut config = default_pci_common_config(); - let mut device = default_device(); + let device = default_device(); let mut status = 0u8; - config.read(0x14, status.as_mut_slice(), device.clone()); + // Initial status should be INIT (0) + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); assert_eq!(status, 0); - config.write(0x14, 0x42u8.as_slice(), device.clone()); - config.read(0x14, status.as_mut_slice(), device.clone()); - assert_eq!(status, 0x42); + + // Valid state transitions + config.write(DEVICE_STATUS, ACKNOWLEDGE.as_slice(), device.clone(), false); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, ACKNOWLEDGE); + + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER).as_slice(), + device.clone(), + false, + ); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, ACKNOWLEDGE | DRIVER); + + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER | FEATURES_OK).as_slice(), + device.clone(), + false, + ); + config.read(DEVICE_STATUS, 
status.as_mut_slice(), device.clone()); + assert_eq!(status, ACKNOWLEDGE | DRIVER | FEATURES_OK); + + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK).as_slice(), + device.clone(), + false, + ); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK); + + // Reset should always work + config.write(DEVICE_STATUS, INIT.as_slice(), device.clone(), true); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, INIT); + } + + #[test] + fn test_device_status_invalid_transitions() { + let mut config = default_pci_common_config(); + let device = default_device(); + + // Helper to attempt a transition and verify it was rejected. + let mut assert_rejected = |config: &mut VirtioPciCommonConfig, new: u8, expected: u8| { + config.write(DEVICE_STATUS, new.as_slice(), device.clone(), false); + let mut s = 0u8; + config.read(DEVICE_STATUS, s.as_mut_slice(), device.clone()); + assert_eq!(s, expected, "transition to {new:#x} should be rejected"); + }; + + // Check the initial state is INIT (0) + let mut status = 0; + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, INIT); + + // Skip ACKNOWLEDGE: INIT -> ACKNOWLEDGE | DRIVER + assert_rejected(&mut config, ACKNOWLEDGE | DRIVER, INIT); + // Arbitrary value from INIT + assert_rejected(&mut config, 0x42, INIT); + + // Advance to ACKNOWLEDGE | DRIVER | FEATURES_OK + config.write(DEVICE_STATUS, ACKNOWLEDGE.as_slice(), device.clone(), false); + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER).as_slice(), + device.clone(), + false, + ); + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER | FEATURES_OK).as_slice(), + device.clone(), + false, + ); + let expected = ACKNOWLEDGE | DRIVER | FEATURES_OK; + + // Go back: FEATURES_OK -> DRIVER + assert_rejected(&mut config, ACKNOWLEDGE | DRIVER, expected); + // Valid transition FEATURES_OK -> DRIVER_OK 
but without cumulative bits + assert_rejected(&mut config, DRIVER_OK, expected); + + // Re-write FEATURES_OK (no state change). NOTE(review): this repeats the block above; DRIVER_OK was probably intended here -- confirm. + config.write( + DEVICE_STATUS, + (ACKNOWLEDGE | DRIVER | FEATURES_OK).as_slice(), + device.clone(), + false, + ); + let expected = ACKNOWLEDGE | DRIVER | FEATURES_OK; + + // Go back from FEATURES_OK + assert_rejected(&mut config, ACKNOWLEDGE | DRIVER, expected); + } + + #[test] + fn test_device_activated_blocks_transitions() { + let mut config = default_pci_common_config(); + let device = default_device(); + let mut status = 0u8; + + // Simulate a failed reset: driver_status is INIT but device is still activated. + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!(status, INIT); + + // Every transition should be rejected when device_activated is true at INIT. + for &value in &[ + ACKNOWLEDGE, + ACKNOWLEDGE | DRIVER, + ACKNOWLEDGE | DRIVER | FEATURES_OK, + ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK, + ] { + config.write(DEVICE_STATUS, value.as_slice(), device.clone(), true); + config.read(DEVICE_STATUS, status.as_mut_slice(), device.clone()); + assert_eq!( + status, INIT, + "transition to {value:#x} should be blocked while device is activated" + ); + } } #[test] @@ -516,14 +755,14 @@ mod tests { // Our device has 2 queues, so we should be using 3 vectors in total. // Trying to set a vector bigger than that should fail. Observing the // failure happens through a subsequent read that should return NO_VECTOR.
- config.write(0x10, 3u16.as_slice(), device.clone()); - config.read(0x10, vector.as_mut_slice(), device.clone()); + config.write(MSIX_CONFIG, 3u16.as_slice(), device.clone(), false); + config.read(MSIX_CONFIG, vector.as_mut_slice(), device.clone()); assert_eq!(vector, VIRTQ_MSI_NO_VECTOR); // Any of the 3 valid values should work for i in 0u16..3 { - config.write(0x10, i.as_slice(), device.clone()); - config.read(0x10, vector.as_mut_slice(), device.clone()); + config.write(MSIX_CONFIG, i.as_slice(), device.clone(), false); + config.read(MSIX_CONFIG, vector.as_mut_slice(), device.clone()); assert_eq!(vector, i); } } @@ -536,8 +775,8 @@ mod tests { let mut max_size = [0u16; 2]; for queue_id in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); - config.read(0x18, len.as_mut_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!( len, device.lock().unwrap().queues()[queue_id as usize].max_size @@ -545,21 +784,42 @@ mod tests { max_size[queue_id as usize] = len; } - config.write(0x16, 2u16.as_slice(), device.clone()); - config.read(0x18, len.as_mut_slice(), device.clone()); + // Before FEATURES_OK is set, the driver should not be able to change the queue size. + config.driver_status = ACKNOWLEDGE | DRIVER; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.write(QUEUE_SIZE, 0u16.as_slice(), device.clone(), false); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); + assert_eq!(len, max_size[queue_id as usize]); + } + + // Verify writing a queue size to a non-existent queue is ignored. + config.write(QUEUE_SELECT, 2u16.as_slice(), device.clone(), false); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, 0); + // Set FEATURES_OK so that the driver can change the queue size. 
+ config.driver_status |= FEATURES_OK; + // Setup size smaller than what is the maximum offered for queue_id in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); config.write( - 0x18, + QUEUE_SIZE, (max_size[queue_id as usize] - 1).as_slice(), device.clone(), + false, ); - config.read(0x18, len.as_mut_slice(), device.clone()); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); assert_eq!(len, max_size[queue_id as usize] - 1); } + + // Verify writes are rejected after DRIVER_OK is set. + config.driver_status |= DRIVER_OK; + config.write(QUEUE_SELECT, 0u16.as_slice(), device.clone(), false); + config.write(QUEUE_SIZE, 0u16.as_slice(), device.clone(), false); + config.read(QUEUE_SIZE, len.as_mut_slice(), device.clone()); + assert_eq!(len, max_size[0] - 1); } #[test] @@ -573,16 +833,21 @@ mod tests { // failure happens through a subsequent read that should return NO_VECTOR. for queue_id in 0u16..2 { // Select queue - config.write(0x16, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); - config.write(0x1a, 3u16.as_slice(), device.clone()); - config.read(0x1a, vector.as_mut_slice(), device.clone()); + config.write(QUEUE_MSIX_VECTOR, 3u16.as_slice(), device.clone(), false); + config.read(QUEUE_MSIX_VECTOR, vector.as_mut_slice(), device.clone()); assert_eq!(vector, VIRTQ_MSI_NO_VECTOR); // Any of the 3 valid values should work for vector_id in 0u16..3 { - config.write(0x1a, vector_id.as_slice(), device.clone()); - config.read(0x1a, vector.as_mut_slice(), device.clone()); + config.write( + QUEUE_MSIX_VECTOR, + vector_id.as_slice(), + device.clone(), + false, + ); + config.read(QUEUE_MSIX_VECTOR, vector.as_mut_slice(), device.clone()); assert_eq!(vector, vector_id); } } @@ -594,21 +859,42 @@ mod tests { let device = default_device(); let mut enabled = 0u16; + // Initially queue should be disabled for queue_id 
in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); + assert_eq!(enabled, 0); + } - // Initially queue should be disabled - config.read(0x1c, enabled.as_mut_slice(), device.clone()); + // Enabling a queue before FEATURES_OK should be ignored. + config.driver_status = ACKNOWLEDGE | DRIVER; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.write(QUEUE_ENABLE, 1u16.as_slice(), device.clone(), false); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 0); + } + + // Set FEATURES_OK so that the driver can enable the queue. + config.driver_status |= FEATURES_OK; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.write(QUEUE_ENABLE, 1u16.as_slice(), device.clone(), false); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); + assert_eq!(enabled, 1); - // Enable queue - config.write(0x1c, 1u16.as_slice(), device.clone()); - config.read(0x1c, enabled.as_mut_slice(), device.clone()); + // The driver MUST NOT write a 0 to queue_enable. + config.write(QUEUE_ENABLE, 0u16.as_slice(), device.clone(), false); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 1); + } - // According to the specification "The driver MUST NOT write a 0 to queue_enable." 
- config.write(0x1c, 0u16.as_slice(), device.clone()); - config.read(0x1c, enabled.as_mut_slice(), device.clone()); + // Verify writes are rejected after DRIVER_OK + config.driver_status |= DRIVER_OK; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.write(QUEUE_ENABLE, 0u16.as_slice(), device.clone(), false); + config.read(QUEUE_ENABLE, enabled.as_mut_slice(), device.clone()); assert_eq!(enabled, 1); } } @@ -624,13 +910,13 @@ mod tests { // a field setup by the device and should be read-only for the driver for queue_id in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); - config.read(0x1e, offset.as_mut_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + config.read(QUEUE_NOTIFY_OFF, offset.as_mut_slice(), device.clone()); assert_eq!(offset, queue_id); // Writing to it should not have any effect - config.write(0x1e, 0x42.as_slice(), device.clone()); - config.read(0x1e, offset.as_mut_slice(), device.clone()); + config.write(QUEUE_NOTIFY_OFF, 0x42.as_slice(), device.clone(), false); + config.read(QUEUE_NOTIFY_OFF, offset.as_mut_slice(), device.clone()); assert_eq!(offset, queue_id); } } @@ -644,8 +930,8 @@ mod tests { let lo32 = (value & 0xffff_ffff) as u32; let hi32 = (value >> 32) as u32; - config.write(offset, lo32.as_slice(), device.clone()); - config.write(offset + 4, hi32.as_slice(), device.clone()); + config.write(offset, lo32.as_slice(), device.clone(), false); + config.write(offset + 4, hi32.as_slice(), device.clone(), false); } fn read_64bit_field( @@ -666,12 +952,24 @@ mod tests { fn test_queue_addresses() { let mut config = default_pci_common_config(); let device = default_device(); - let mut reg64bit = 0; + // Before FEATURES_OK is set, the driver should not be able to change the queue addresses. 
+ config.driver_status = ACKNOWLEDGE | DRIVER; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + + for offset in [QUEUE_DESC_LO, QUEUE_AVAIL_LO, QUEUE_USED_LO] { + write_64bit_field(&mut config, device.clone(), offset, 0x0000_1312_0000_1110); + assert_eq!(read_64bit_field(&mut config, device.clone(), offset), 0); + } + } + + // Set status so queue fields can be modified + config.driver_status |= FEATURES_OK; for queue_id in 0u16..2 { - config.write(0x16, queue_id.as_slice(), device.clone()); + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); - for offset in [0x20, 0x28, 0x30] { + for offset in [QUEUE_DESC_LO, QUEUE_AVAIL_LO, QUEUE_USED_LO] { write_64bit_field(&mut config, device.clone(), offset, 0x0000_1312_0000_1110); assert_eq!( read_64bit_field(&mut config, device.clone(), offset), @@ -679,6 +977,20 @@ mod tests { ); } } + + // Verify writes are rejected after DRIVER_OK + config.driver_status |= DRIVER_OK; + for queue_id in 0u16..2 { + config.write(QUEUE_SELECT, queue_id.as_slice(), device.clone(), false); + + for offset in [QUEUE_DESC_LO, QUEUE_AVAIL_LO, QUEUE_USED_LO] { + write_64bit_field(&mut config, device.clone(), offset, 0xDEAD_BEEF); + assert_eq!( + read_64bit_field(&mut config, device.clone(), offset), + 0x0000_1312_0000_1110 + ); + } + } } #[test] @@ -697,51 +1009,51 @@ mod tests { device.lock().unwrap().queues_mut()[0].desc_table_address = GuestAddress(0x0000_1312_0000_1110); let mut buffer = [0u8; 8]; - config.read(0x20, &mut buffer[..1], device.clone()); + config.read(QUEUE_DESC_LO, &mut buffer[..1], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x20, &mut buffer[..2], device.clone()); + config.read(QUEUE_DESC_LO, &mut buffer[..2], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x20, &mut buffer[..8], device.clone()); + config.read(QUEUE_DESC_LO, &mut buffer[..8], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x20, &mut 
buffer[..4], device.clone()); + config.read(QUEUE_DESC_LO, &mut buffer[..4], device.clone()); assert_eq!(LittleEndian::read_u32(&buffer[..4]), 0x1110); - config.read(0x24, &mut buffer[..4], device.clone()); + config.read(QUEUE_DESC_HI, &mut buffer[..4], device.clone()); assert_eq!(LittleEndian::read_u32(&buffer[..4]), 0x1312); // 32-bit fields config.device_feature_select = 0x42; let mut buffer = [0u8; 8]; - config.read(0, &mut buffer[..1], device.clone()); + config.read(DEVICE_FEATURE_SELECT, &mut buffer[..1], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0, &mut buffer[..2], device.clone()); + config.read(DEVICE_FEATURE_SELECT, &mut buffer[..2], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0, &mut buffer[..8], device.clone()); + config.read(DEVICE_FEATURE_SELECT, &mut buffer[..8], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0, &mut buffer[..4], device.clone()); + config.read(DEVICE_FEATURE_SELECT, &mut buffer[..4], device.clone()); assert_eq!(LittleEndian::read_u32(&buffer[..4]), 0x42); // 16-bit fields let mut buffer = [0u8; 8]; config.queue_select = 0x42; - config.read(0x16, &mut buffer[..1], device.clone()); + config.read(QUEUE_SELECT, &mut buffer[..1], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x16, &mut buffer[..4], device.clone()); + config.read(QUEUE_SELECT, &mut buffer[..4], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x16, &mut buffer[..8], device.clone()); + config.read(QUEUE_SELECT, &mut buffer[..8], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x16, &mut buffer[..2], device.clone()); + config.read(QUEUE_SELECT, &mut buffer[..2], device.clone()); assert_eq!(LittleEndian::read_u16(&buffer[..2]), 0x42); // 8-bit fields let mut buffer = [0u8; 8]; config.driver_status = 0x42; - config.read(0x14, &mut buffer[..2], device.clone()); + config.read(DEVICE_STATUS, &mut buffer[..2], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x14, &mut 
buffer[..4], device.clone()); + config.read(DEVICE_STATUS, &mut buffer[..4], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x14, &mut buffer[..8], device.clone()); + config.read(DEVICE_STATUS, &mut buffer[..8], device.clone()); assert_eq!(buffer, [0u8; 8]); - config.read(0x14, &mut buffer[..1], device.clone()); + config.read(DEVICE_STATUS, &mut buffer[..1], device.clone()); assert_eq!(buffer[0], 0x42); } } diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index f0cc8bdefc7..5716b321fec 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -34,6 +34,7 @@ use crate::devices::virtio::queue::Queue; use crate::devices::virtio::transport::pci::common_config::{ VirtioPciCommonConfig, VirtioPciCommonConfigState, }; +use crate::devices::virtio::transport::pci::device_status::*; use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::logger::{debug, error}; use crate::pci::configuration::{PciCapability, PciConfiguration, PciConfigurationState}; @@ -46,13 +47,6 @@ use crate::vstate::interrupts::{InterruptError, MsixVectorGroup}; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::resources::ResourceAllocator; -const DEVICE_INIT: u8 = 0x00; -const DEVICE_ACKNOWLEDGE: u8 = 0x01; -const DEVICE_DRIVER: u8 = 0x02; -const DEVICE_DRIVER_OK: u8 = 0x04; -const DEVICE_FEATURES_OK: u8 = 0x08; -const DEVICE_FAILED: u8 = 0x80; - /// Vector value used to disable MSI for a queue. 
pub const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; @@ -440,7 +434,7 @@ impl VirtioPciDevice { vectors, )); - let virtio_pci_device = VirtioPciDevice { + let mut virtio_pci_device = VirtioPciDevice { id, pci_device_bdf: state.pci_device_bdf, configuration: pci_config, @@ -468,15 +462,13 @@ impl VirtioPciDevice { } fn is_driver_ready(&self) -> bool { - let ready_bits = - (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); + let ready_bits = (ACKNOWLEDGE | DRIVER | DRIVER_OK | FEATURES_OK); self.common_config.driver_status == ready_bits - && self.common_config.driver_status & DEVICE_FAILED == 0 } /// Determines if the driver has requested the device (re)init / reset itself fn is_driver_init(&self) -> bool { - self.common_config.driver_status == DEVICE_INIT + self.common_config.driver_status == INIT } pub fn config_bar_addr(&self) -> u64 { @@ -843,10 +835,12 @@ impl PciDevice for VirtioPciDevice { fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { match offset { - o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { - self.common_config - .write(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) - } + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write( + o - COMMON_CONFIG_BAR_OFFSET, + data, + self.device.clone(), + self.device_activated.load(Ordering::SeqCst), + ), o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { // We don't actually support legacy INT#x interrupts for VirtIO PCI devices warn!("pci: access to unsupported ISR status field"); @@ -896,6 +890,7 @@ impl PciDevice for VirtioPciDevice { { Ok(()) => self.device_activated.store(true, Ordering::SeqCst), Err(err) => { + self.common_config.driver_status |= DEVICE_NEEDS_RESET; error!("Error activating device: {err:?}"); // Section 2.1.2 of the specification states that we need to send a device @@ -927,9 +922,21 @@ impl PciDevice for VirtioPciDevice { } None => { error!("Attempt to reset
device when not implemented in underlying device"); - // TODO: currently we don't support device resetting, but we still - // follow the spec and set the status field to 0. - self.common_config.driver_status = DEVICE_INIT; + // The virtio spec does not specify what to do if reset fails. + // + // Our MMIO transport sets FAILED in this case, but we must NOT do that for PCI. + // During shutdown, the Linux kernel issues a reset to each virtio device. The + // virtio PCI driver then polls device_status until it reads back 0, unlike the + // virtio MMIO driver which simply writes 0 and returns. Setting FAILED would + // cause the poll to spin forever, breaking the reboot command and Ctrl-Alt-Del. + // - PCI: https://elixir.bootlin.com/linux/v6.19.8/source/drivers/virtio/virtio_pci_modern.c#L546-L565 + // - MMIO: https://elixir.bootlin.com/linux/v6.19.8/source/drivers/virtio/virtio_mmio.c#L251-L258 + // + // Since device_status was already set to INIT by set_device_status(), we don't + // need to set it again here. However, the backend device is still active since + // reset() is unimplemented. The combination of device_activated == true and + // device_status == INIT will cause set_device_status() to block any + // re-initialization attempts.
} } } @@ -949,6 +956,7 @@ impl BusDevice for VirtioPciDevice { #[cfg(test)] mod tests { + use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; use event_manager::MutEventSubscriber; @@ -960,16 +968,19 @@ mod tests { use crate::arch::MEM_64BIT_DEVICES_START; use crate::builder::tests::default_vmm; use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::device_status::{ACKNOWLEDGE, DRIVER, DRIVER_OK, FEATURES_OK}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ids; use crate::devices::virtio::rng::Entropy; + use crate::devices::virtio::transport::pci::common_config_offset::*; use crate::devices::virtio::transport::pci::device::{ COMMON_CONFIG_BAR_OFFSET, COMMON_CONFIG_SIZE, DEVICE_CONFIG_BAR_OFFSET, DEVICE_CONFIG_SIZE, ISR_CONFIG_BAR_OFFSET, ISR_CONFIG_SIZE, NOTIFICATION_BAR_OFFSET, NOTIFICATION_SIZE, NOTIFY_OFF_MULTIPLIER, PciVirtioSubclass, VirtioPciCap, VirtioPciCfgCap, VirtioPciNotifyCap, }; + use crate::devices::virtio::transport::pci::device_status::{ + ACKNOWLEDGE, DRIVER, DRIVER_OK, FEATURES_OK, + }; use crate::pci::PciDevice; use crate::pci::msix::MsixCap; use crate::rate_limiter::RateLimiter; @@ -1371,50 +1382,59 @@ mod tests { let mut locked_virtio_pci_device = device.lock().unwrap(); // Let's read the number of queues of the entropy device - // That information is located at offset 0x12 past the BAR region belonging to the common - // config capability. - let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET).unwrap() + 0x12; + // That information is located at NUM_QUEUES offset past the BAR region belonging to the + // common config capability. 
+ let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET + NUM_QUEUES).unwrap(); let len = 2u32; let num_queues = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); assert_eq!(num_queues, 1); - // Let's update the driver features and see if that takes effect - let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET).unwrap() + 0x14; - let len = 1u32; - let device_status = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); - assert_eq!(device_status, 0); + // Use queue_select to test read/write through the PCI Configuration Access Capability. + // This register is freely read-writable with no side effects, making it ideal for testing + // the capability mechanism itself. + let bar_offset = u32::try_from(COMMON_CONFIG_BAR_OFFSET + QUEUE_SELECT).unwrap(); + let len = 2u32; + let val = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); + assert_eq!(val, 0); + cap_pci_cfg_write( &mut locked_virtio_pci_device, bar_offset, len, - 0x42u32.as_slice(), + 0x01u32.as_slice(), ); - let device_status = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); - assert_eq!(device_status, 0x42); + let val = cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len); + assert_eq!(val, 0x01); - // reads with out-of-bounds lengths should return 0s + // Reads with out-of-bounds lengths should return 0s assert_eq!( cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, 8), 0 ); - // writes out-of-bounds lengths should have no effect + // Writes with out-of-bounds lengths should have no effect cap_pci_cfg_write( &mut locked_virtio_pci_device, bar_offset, 8, - 0x84u32.as_slice(), + 0xDEADu32.as_slice(), ); assert_eq!( - cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, 1), - 0x42 + cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len), + val ); - // Make sure that we handle properly from/to a BAR where the access length doesn't match - // what we've set in the capability's length + + // When the 
capability's length is shorter than pci_cfg_data (4 bytes), only that many + // bytes should be forwarded to the BAR write. Writing 0xDEAD_0000 with length=2 should + // only write the lower 2 bytes (0x0000). cap_pci_cfg_write( &mut locked_virtio_pci_device, bar_offset, - 2, - 0x42u8.as_slice(), + len, + 0xDEAD_0000u32.as_slice(), + ); + assert_eq!( + cap_pci_cfg_read(&mut locked_virtio_pci_device, bar_offset, len), + 0x0000 ); } @@ -1481,28 +1501,44 @@ mod tests { } fn write_driver_status(device: &mut VirtioPciDevice, status: u8) { - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x14, status.as_slice()); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DEVICE_STATUS, + status.as_slice(), + ); } fn read_driver_status(device: &mut VirtioPciDevice) -> u8 { let mut status = 0u8; - device.read_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x14, status.as_mut_slice()); + device.read_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DEVICE_STATUS, + status.as_mut_slice(), + ); status } fn read_device_features(device: &mut VirtioPciDevice) -> u64 { let mut features_lo = 0u32; - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET, 0u32.as_slice()); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DEVICE_FEATURE_SELECT, + 0u32.as_slice(), + ); device.read_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x4, + COMMON_CONFIG_BAR_OFFSET + DEVICE_FEATURE, features_lo.as_mut_slice(), ); let mut features_hi = 0u32; - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET, 1u32.as_slice()); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DEVICE_FEATURE_SELECT, + 1u32.as_slice(), + ); device.read_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x4, + COMMON_CONFIG_BAR_OFFSET + DEVICE_FEATURE, features_hi.as_mut_slice(), ); @@ -1510,16 +1546,24 @@ mod tests { } fn write_driver_features(device: &mut VirtioPciDevice, features: u64) { - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x8, 0u32.as_slice()); device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0xc, + COMMON_CONFIG_BAR_OFFSET + DRIVER_FEATURE_SELECT, + 0u32.as_slice(), 
+ ); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DRIVER_FEATURE, ((features & 0xffff_ffff) as u32).as_slice(), ); - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x8, 1u32.as_slice()); device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0xc, + COMMON_CONFIG_BAR_OFFSET + DRIVER_FEATURE_SELECT, + 1u32.as_slice(), + ); + device.write_bar( + 0, + COMMON_CONFIG_BAR_OFFSET + DRIVER_FEATURE, (((features >> 32) & 0xffff_ffff) as u32).as_slice(), ); } @@ -1527,20 +1571,20 @@ mod tests { fn setup_queues(device: &mut VirtioPciDevice) { device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x20, + COMMON_CONFIG_BAR_OFFSET + QUEUE_DESC_LO, 0x8000_0000u64.as_slice(), ); device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x28, + COMMON_CONFIG_BAR_OFFSET + QUEUE_AVAIL_LO, 0x8000_1000u64.as_slice(), ); device.write_bar( 0, - COMMON_CONFIG_BAR_OFFSET + 0x30, + COMMON_CONFIG_BAR_OFFSET + QUEUE_USED_LO, 0x8000_2000u64.as_slice(), ); - device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + 0x1c, 1u16.as_slice()); + device.write_bar(0, COMMON_CONFIG_BAR_OFFSET + QUEUE_ENABLE, 1u16.as_slice()); } #[test] @@ -1554,27 +1598,21 @@ mod tests { assert!( !locked_virtio_pci_device .device_activated - .load(std::sync::atomic::Ordering::SeqCst) + .load(Ordering::SeqCst) ); - write_driver_status( - &mut locked_virtio_pci_device, - ACKNOWLEDGE.try_into().unwrap(), - ); - write_driver_status( - &mut locked_virtio_pci_device, - (ACKNOWLEDGE | DRIVER).try_into().unwrap(), - ); + write_driver_status(&mut locked_virtio_pci_device, ACKNOWLEDGE); + write_driver_status(&mut locked_virtio_pci_device, ACKNOWLEDGE | DRIVER); assert!(!locked_virtio_pci_device.is_driver_init()); assert!(!locked_virtio_pci_device.is_driver_ready()); assert!( !locked_virtio_pci_device .device_activated - .load(std::sync::atomic::Ordering::SeqCst) + .load(Ordering::SeqCst) ); let status = read_driver_status(&mut locked_virtio_pci_device); - assert_eq!(status as u32, ACKNOWLEDGE | DRIVER); + assert_eq!(status, ACKNOWLEDGE | DRIVER); // 
Entropy device just offers VIRTIO_F_VERSION_1 let offered_features = read_device_features(&mut locked_virtio_pci_device); @@ -1583,26 +1621,24 @@ mod tests { write_driver_features(&mut locked_virtio_pci_device, offered_features); write_driver_status( &mut locked_virtio_pci_device, - (ACKNOWLEDGE | DRIVER | FEATURES_OK).try_into().unwrap(), + ACKNOWLEDGE | DRIVER | FEATURES_OK, ); let status = read_driver_status(&mut locked_virtio_pci_device); - assert!((status & u8::try_from(FEATURES_OK).unwrap()) != 0); + assert!((status & FEATURES_OK) != 0); assert!(!locked_virtio_pci_device.is_driver_init()); assert!(!locked_virtio_pci_device.is_driver_ready()); assert!( !locked_virtio_pci_device .device_activated - .load(std::sync::atomic::Ordering::SeqCst) + .load(Ordering::SeqCst) ); setup_queues(&mut locked_virtio_pci_device); write_driver_status( &mut locked_virtio_pci_device, - (ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK) - .try_into() - .unwrap(), + ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK, ); assert!(!locked_virtio_pci_device.is_driver_init()); @@ -1610,7 +1646,73 @@ mod tests { assert!( locked_virtio_pci_device .device_activated - .load(std::sync::atomic::Ordering::SeqCst) + .load(Ordering::SeqCst) ); } + + #[test] + fn test_activate_failure_sets_needs_reset() { + // Verify that DEVICE_NEEDS_RESET is set in driver_status when device activation fails. + use crate::devices::virtio::transport::pci::device_status::DEVICE_NEEDS_RESET; + + let mut vmm = create_vmm_with_virtio_pci_device(); + let device = get_virtio_device(&vmm); + let mut locked = device.lock().unwrap(); + + // Drive through init without setting up queues, so activate() fails. 
+ write_driver_status(&mut locked, ACKNOWLEDGE); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER); + let features = read_device_features(&mut locked); + write_driver_features(&mut locked, features); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER | FEATURES_OK); + // Skip setup_queues() -- queues are not ready, so activate() will fail. + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK); + + assert!(!locked.device_activated.load(Ordering::SeqCst)); + let status = read_driver_status(&mut locked); + assert_eq!(status & DEVICE_NEEDS_RESET, DEVICE_NEEDS_RESET); + } + + #[test] + fn test_failed_reset_blocks_reinitialization() { + let mut vmm = create_vmm_with_virtio_pci_device(); + let device = get_virtio_device(&vmm); + let mut locked = device.lock().unwrap(); + + // Full initialization sequence. + write_driver_status(&mut locked, ACKNOWLEDGE); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER); + let features = read_device_features(&mut locked); + write_driver_features(&mut locked, features); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER | FEATURES_OK); + setup_queues(&mut locked); + write_driver_status(&mut locked, ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK); + assert!(locked.device_activated.load(Ordering::SeqCst)); + + // Write 0 to device_status to request a reset. + // Entropy's reset() returns None (unimplemented), so the reset fails. + write_driver_status(&mut locked, 0); + assert_eq!(read_driver_status(&mut locked), 0); + // device_activated stays true because the backend was not actually reset. + assert!(locked.device_activated.load(Ordering::SeqCst)); + + // Attempt to re-initialize should be rejected because device_activated is + // still true while driver_status is INIT. 
+ write_driver_status(&mut locked, ACKNOWLEDGE); + assert_eq!(read_driver_status(&mut locked), 0); + + // Save state and restore into a new device -- the combination of + // device_activated == true and driver_status == INIT is preserved in the + // snapshot, so the blocking behavior survives restore. + let saved_state = locked.state(); + drop(locked); + + let new_entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + let restored = + VirtioPciDevice::new_from_state("rng".to_string(), &vmm.vm, new_entropy, saved_state) + .unwrap(); + + assert!(restored.device_activated.load(Ordering::SeqCst)); + assert_eq!(restored.common_config.driver_status, 0); + } } diff --git a/src/vmm/src/devices/virtio/transport/pci/mod.rs b/src/vmm/src/devices/virtio/transport/pci/mod.rs index 520b52274b3..fba4b8faaa8 100644 --- a/src/vmm/src/devices/virtio/transport/pci/mod.rs +++ b/src/vmm/src/devices/virtio/transport/pci/mod.rs @@ -3,3 +3,66 @@ pub mod common_config; pub mod device; + +/// Virtio device status field values +/// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-110001 +/// +/// These are u8 because the PCI transport's device_status register is 8 bits wide. +/// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1420003 +pub(crate) mod device_status { + pub const INIT: u8 = 0x00; + pub const ACKNOWLEDGE: u8 = 0x01; + pub const DRIVER: u8 = 0x02; + pub const DRIVER_OK: u8 = 0x04; + pub const FEATURES_OK: u8 = 0x08; + pub const DEVICE_NEEDS_RESET: u8 = 0x40; + pub const FAILED: u8 = 0x80; +} + +/// Virtio PCI common configuration register offsets +/// https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1420003 +/// ```c +/// struct virtio_pci_common_config { +/// /* About the whole device. 
*/ +/// le32 device_feature_select; /* read-write */ +/// le32 device_feature; /* read-only for driver */ +/// le32 driver_feature_select; /* read-write */ +/// le32 driver_feature; /* read-write */ +/// le16 msix_config; /* read-write */ +/// le16 num_queues; /* read-only for driver */ +/// u8 device_status; /* read-write */ +/// u8 config_generation; /* read-only for driver */ +/// +/// /* About a specific virtqueue. */ +/// le16 queue_select; /* read-write */ +/// le16 queue_size; /* read-write, power of 2, or 0. */ +/// le16 queue_msix_vector; /* read-write */ +/// le16 queue_enable; /* read-write */ +/// le16 queue_notify_off; /* read-only for driver */ +/// le64 queue_desc; /* read-write */ +/// le64 queue_avail; /* read-write */ +/// le64 queue_used; /* read-write */ +/// }; +/// ``` +pub(crate) mod common_config_offset { + pub const DEVICE_FEATURE_SELECT: u64 = 0x00; + pub const DEVICE_FEATURE: u64 = 0x04; + pub const DRIVER_FEATURE_SELECT: u64 = 0x08; + pub const DRIVER_FEATURE: u64 = 0x0c; + pub const MSIX_CONFIG: u64 = 0x10; + pub const NUM_QUEUES: u64 = 0x12; + pub const DEVICE_STATUS: u64 = 0x14; + pub const CONFIG_GENERATION: u64 = 0x15; + + pub const QUEUE_SELECT: u64 = 0x16; + pub const QUEUE_SIZE: u64 = 0x18; + pub const QUEUE_MSIX_VECTOR: u64 = 0x1a; + pub const QUEUE_ENABLE: u64 = 0x1c; + pub const QUEUE_NOTIFY_OFF: u64 = 0x1e; + pub const QUEUE_DESC_LO: u64 = 0x20; + pub const QUEUE_DESC_HI: u64 = 0x24; + pub const QUEUE_AVAIL_LO: u64 = 0x28; + pub const QUEUE_AVAIL_HI: u64 = 0x2c; + pub const QUEUE_USED_LO: u64 = 0x30; + pub const QUEUE_USED_HI: u64 = 0x34; +} diff --git a/src/vmm/src/dumbo/pdu/tcp.rs b/src/vmm/src/dumbo/pdu/tcp.rs index 4ff1da93dd7..38d710229d8 100644 --- a/src/vmm/src/dumbo/pdu/tcp.rs +++ b/src/vmm/src/dumbo/pdu/tcp.rs @@ -257,7 +257,13 @@ impl TcpSegment<'_, T> { } _ => { // Some other option; just skip opt_len bytes in total. 
- i += b[i + 1] as usize; + // Per RFC 9293 (MUST-7), opt_len includes the kind and + // length bytes so the minimum valid value is 2. + let opt_len = b[i + 1] as usize; + if opt_len < 2 { + return Err(TcpError::MssOption); + } + i += opt_len; continue; } } @@ -812,4 +818,25 @@ mod tests { TcpError::MssRemaining ); } + + #[test] + fn test_invalid_tcp_option_len() { + // Build a minimal segment with header_len = 24 (OPTIONS_OFFSET + 4 bytes of options). + let mut buf = [0u8; 100]; + let header_len: u8 = OPTIONS_OFFSET + 4; + { + let mut seg = TcpSegment::from_bytes_unchecked(buf.as_mut()); + seg.set_header_len_rsvd_ns(header_len, false); + } + // Write an unknown option kind (0xFF) with opt_len = 0 (invalid, < 2). + let opts_start = usize::from(OPTIONS_OFFSET); + buf[opts_start] = 0xFF; + buf[opts_start + 1] = 0; + + let seg = TcpSegment::from_bytes_unchecked(buf.as_ref()); + assert_eq!( + seg.parse_mss_option_unchecked(header_len.into()), + Err(TcpError::MssOption) + ); + } } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 30273e92c06..b0031931200 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -144,13 +144,15 @@ use crate::devices::virtio::block::device::Block; use crate::devices::virtio::mem::{VIRTIO_MEM_DEV_ID, VirtioMem, VirtioMemError, VirtioMemStatus}; use crate::devices::virtio::net::Net; use crate::logger::{METRICS, MetricsError, error, info, warn}; -use crate::persist::{MicrovmState, MicrovmStateError, VmInfo}; +use crate::persist::{GuestRegionUffdMapping, MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; +use crate::utils::usize_to_u64; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::VcpuState; pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse}; pub use crate::vstate::vm::Vm; +use crate::vstate::vm::mincore_bitmap; /// Shorthand type for the 
EventManager flavour used by Firecracker. pub type EventManager = BaseEventManager>>; @@ -254,6 +256,8 @@ pub enum VmmError { Block(#[from] BlockError), /// Balloon: {0} Balloon(#[from] BalloonError), + /// Pagemap error: {0} + Pagemap(#[from] utils::pagemap::PagemapError), /// Failed to create memory hotplug device: {0} VirtioMem(#[from] VirtioMemError), } @@ -313,6 +317,8 @@ pub struct Vmm { vcpus_exit_evt: EventFd, // Device manager device_manager: DeviceManager, + /// Page size used for backing guest memory + pub page_size: usize, } impl Vmm { @@ -690,6 +696,130 @@ impl Vmm { pub fn vm(&self) -> &Vm { &self.vm } + + /// Get the list of mappings for guest memory + pub fn guest_memory_mappings(&self, page_size: usize) -> Vec { + let mut mappings = vec![]; + let mut offset = 0; + + for region in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + let size = region.slice.len(); + #[allow(deprecated)] + mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: region.slice.ptr_guard_mut().as_ptr() as u64, + size, + offset, + page_size, + page_size_kib: page_size, + }); + + offset += usize_to_u64(size); + } + + mappings + } + + /// Get info regarding resident and empty pages for guest memory + pub fn guest_memory_info(&self, page_size: usize) -> Result<(Vec, Vec), VmmError> { + let mut resident = vec![]; + let mut empty = vec![]; + let zero_page = vec![0u8; page_size]; + + for mem_slot in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + debug_assert!(mem_slot.slice.len().is_multiple_of(page_size)); + debug_assert!( + (mem_slot.slice.ptr_guard_mut().as_ptr() as usize).is_multiple_of(page_size) + ); + + let len = mem_slot.slice.len(); + let nr_pages = len / page_size; + let addr = mem_slot.slice.ptr_guard_mut().as_ptr(); + let mut curr_empty = vec![0u64; nr_pages.div_ceil(64)]; + let curr_resident = mincore_bitmap(addr, mem_slot.slice.len(), page_size)?; + + for page_idx in 0..nr_pages { 
+ if (curr_resident[page_idx / 64] & (1u64 << (page_idx % 64))) == 0 { + continue; + } + + // SAFETY: `addr` points to a memory region that is `nr_pages * page_size` long. + let curr_addr = unsafe { addr.add(page_idx * page_size) }; + + // SAFETY: both addresses are valid and they point to a memory region + // that is (at least) `page_size` long + let ret = unsafe { + libc::memcmp( + curr_addr.cast::(), + zero_page.as_ptr().cast::(), + page_size, + ) + }; + + if ret == 0 { + curr_empty[page_idx / 64] |= 1u64 << (page_idx % 64); + } + } + + resident.extend_from_slice(&curr_resident); + empty.extend_from_slice(&curr_empty); + } + + Ok((resident, empty)) + } + + /// Get dirty pages bitmap for guest memory + pub fn get_dirty_memory(&self, page_size: usize) -> Result, VmmError> { + let pagemap = utils::pagemap::PagemapReader::new(page_size)?; + let mut dirty_bitmap = vec![]; + + for mem_slot in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + let base_addr = mem_slot.slice.ptr_guard_mut().as_ptr() as usize; + let len = mem_slot.slice.len(); + let nr_pages = len / page_size; + + // Use mincore_bitmap to get resident pages at guest page size granularity + let resident_bitmap = vstate::vm::mincore_bitmap(base_addr as *mut u8, len, page_size)?; + + // TODO: if we don't support UFFD/async WP, we can completely skip this bit, as the + // UFFD handler already tracks dirty pages through the WriteProtected events. For the + // time being, we always do. + // + // Build dirty bitmap: check pagemap only for pages that mincore reports resident. + // This way we reduce the number of times we read out of /proc/<pid>/pagemap. + let mut slot_bitmap = vec![0u64; nr_pages.div_ceil(64)]; + for page_idx in 0..nr_pages { + // Check if page is resident in the bitmap. + // TODO: These operations (add to bitmap, check for presence, etc.) merit their own + // implementation, somewhere within a bitmap type).
+ let is_resident = (resident_bitmap[page_idx / 64] & (1u64 << (page_idx % 64))) != 0; + if is_resident { + let virt_addr = base_addr + (page_idx * page_size); + if pagemap.is_page_dirty(virt_addr)? { + slot_bitmap[page_idx / 64] |= 1u64 << (page_idx % 64); + } + } + } + + dirty_bitmap.extend_from_slice(&slot_bitmap); + } + + Ok(dirty_bitmap) + } } /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM diff --git a/src/vmm/src/logger/logging.rs b/src/vmm/src/logger/logging.rs index 8afdf976ffb..a108fb3474b 100644 --- a/src/vmm/src/logger/logging.rs +++ b/src/vmm/src/logger/logging.rs @@ -13,7 +13,7 @@ use serde::{Deserialize, Deserializer, Serialize}; use utils::time::LocalTime; use super::metrics::{IncMetric, METRICS}; -use crate::utils::open_file_write_nonblock; +use crate::utils::open_file_nonblock; /// Default level filter for logger matching the swagger specification /// (`src/firecracker/swagger/firecracker.yaml`). @@ -62,7 +62,7 @@ impl Logger { ); if let Some(log_path) = config.log_path { - let file = open_file_write_nonblock(&log_path).map_err(LoggerUpdateError)?; + let file = open_file_nonblock(&log_path).map_err(LoggerUpdateError)?; guard.target = Some(file); }; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist/mod.rs similarity index 91% rename from src/vmm/src/persist.rs rename to src/vmm/src/persist/mod.rs index ba2608070c6..6a728f44a67 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist/mod.rs @@ -11,10 +11,11 @@ use std::os::unix::io::AsRawFd; use std::os::unix::net::UnixStream; use std::path::Path; use std::sync::{Arc, Mutex}; +use std::time::Instant; use semver::Version; use serde::{Deserialize, Serialize}; -use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use userfaultfd::{FeatureFlags, RegisterMode, Uffd, UffdBuilder}; use vmm_sys_util::sock_ctrl_msg::ScmSocket; #[cfg(target_arch = "aarch64")] @@ -29,7 +30,7 @@ use crate::device_manager::{DevicePersistError, DevicesState}; use 
crate::logger::{info, warn}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; -use crate::snapshot::Snapshot; +use crate::snapshot::{Snapshot, SnapshotError, SnapshotHdr}; use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; @@ -43,6 +44,10 @@ use crate::vstate::vcpu::{VcpuSendEventError, VcpuState}; use crate::vstate::vm::{VmError, VmState}; use crate::{EventManager, Vmm, vstate}; +pub(crate) mod v1_10; +pub(crate) mod v1_12; +pub(crate) mod v1_14; + /// Holds information related to the VM that is not part of VmState. #[derive(Clone, Debug, Default, Deserialize, PartialEq, Eq, Serialize)] pub struct VmInfo { @@ -161,8 +166,10 @@ pub fn create_snapshot( snapshot_state_to_file(&microvm_state, &params.snapshot_path)?; - vmm.vm - .snapshot_memory_to_file(&params.mem_file_path, params.snapshot_type)?; + if let Some(mem_file_path) = params.mem_file_path.as_ref() { + vmm.vm + .snapshot_memory_to_file(mem_file_path, params.snapshot_type, vmm.page_size)?; + } // We need to mark queues as dirty again for all activated devices.
The reason we // do it here is that we don't mark pages as dirty during runtime @@ -427,6 +434,7 @@ pub fn restore_from_snapshot( uffd, seccomp_filters, vm_resources, + params.clock_realtime, ) .map_err(RestoreFromSnapshotError::Build) } @@ -445,10 +453,36 @@ pub enum SnapshotStateFromFileError { fn snapshot_state_from_file( snapshot_path: &Path, ) -> Result { - let mut snapshot_reader = File::open(snapshot_path)?; - let snapshot = Snapshot::load(&mut snapshot_reader)?; + let start = Instant::now(); + + let data = std::fs::read(snapshot_path)?; + let version = SnapshotHdr::load(&mut data.as_slice())?.version; - Ok(snapshot.data) + let mut snapshot_reader = data.as_slice(); + let data = match (version.major, version.minor) { + (8, 0) => Snapshot::load(&mut snapshot_reader)?.data, + (6, 0) => { + let v12_state = Snapshot::::load(&mut snapshot_reader)?; + MicrovmState::try_from(v12_state.data).unwrap() + } + (4, 0) => { + let v10_state = Snapshot::::load(&mut snapshot_reader)?; + let v12_state = v1_12::MicrovmState::from(v10_state.data); + MicrovmState::try_from(v12_state).unwrap() + } + _ => { + return Err(SnapshotStateFromFileError::Load( + SnapshotError::InvalidFormatVersion(version), + )); + } + }; + + info!( + "Loading snapshot file took {} usec", + start.elapsed().as_micros() + ); + + Ok(data) } /// Error type for [`guest_memory_from_file`]. @@ -481,6 +515,8 @@ pub enum GuestMemoryFromUffdError { Create(userfaultfd::Error), /// Failed to register memory address range with the userfaultfd object: {0} Register(userfaultfd::Error), + /// Failed to enable write protection on memory address range with the userfaultfd object: {0} + WriteProtect(userfaultfd::Error), /// Failed to connect to UDS Unix stream: {0} Connect(#[from] std::io::Error), /// Failed to sends file descriptor: {0} @@ -502,7 +538,9 @@ fn guest_memory_from_uffd( // because the only place the kernel checks this is in a hook from madvise, e.g. 
it doesn't // actively change the behavior of UFFD, only passively. Without balloon devices // we never call madvise anyway, so no need to put this into a conditional. - uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); + uffd_builder.require_features( + FeatureFlags::EVENT_REMOVE | FeatureFlags::MISSING_HUGETLBFS | FeatureFlags::WP_ASYNC, + ); let uffd = uffd_builder .close_on_exec(true) @@ -512,8 +550,22 @@ fn guest_memory_from_uffd( .map_err(GuestMemoryFromUffdError::Create)?; for mem_region in guest_memory.iter() { - uffd.register(mem_region.as_ptr().cast(), mem_region.size() as _) - .map_err(GuestMemoryFromUffdError::Register)?; + uffd.register_with_mode( + mem_region.as_ptr().cast(), + mem_region.size() as _, + RegisterMode::MISSING | RegisterMode::WRITE_PROTECT, + ) + .map_err(GuestMemoryFromUffdError::Register)?; + + // If memory is backed by huge pages, we can immediately write protect it. + // Otherwise (memory is backed by anonymous memory), write protecting here + // won't have any effect, as the write-protection bit for a page will be + // wiped when the first page fault occurs. These cases need to be handled + // directly from the UFFD handler. + if huge_pages.is_hugetlbfs() { + uffd.write_protect(mem_region.as_ptr().cast(), mem_region.size() as _) + .map_err(GuestMemoryFromUffdError::WriteProtect)?; + } } send_uffd_handshake(mem_uds_path, &backend_mappings, &uffd)?; diff --git a/src/vmm/src/persist/v1_10/aarch64.rs b/src/vmm/src/persist/v1_10/aarch64.rs new file mode 100644 index 00000000000..c85896a0b32 --- /dev/null +++ b/src/vmm/src/persist/v1_10/aarch64.rs @@ -0,0 +1,39 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use serde::{Deserialize, Serialize}; + +use crate::cpu_config::templates::KvmCapability; +use super::MMIODeviceInfo; + +// Types that are identical across all versions — canonical definitions in v1_14.
+pub use crate::persist::v1_14::DeviceType; + +// Types that are identical in v1.10 and v1.12 — canonical definitions in v1_12. +pub use crate::persist::v1_12::{ + // aarch64 GicState is identical in v1.10 and v1.12 (gains its_state in v1.14) + GicState, + // aarch64 VcpuState is identical in v1.10 and v1.12 (gains pvtime_ipa in v1.14) + VcpuState, +}; + +// ─────────────────────────────────────────────────────────────────── +// aarch64 legacy device info (v1.10 layout: uses v1.10 MMIODeviceInfo with irqs: Vec) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedLegacyState { + pub type_: DeviceType, + pub device_info: MMIODeviceInfo, +} + +// ─────────────────────────────────────────────────────────────────── +// VM state (aarch64, v1.10) +// In v1.10, VmState holds kvm_cap_modifiers; memory_state is at MicrovmState level. +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub gic: GicState, + pub kvm_cap_modifiers: Vec, +} diff --git a/src/vmm/src/persist/v1_10/mod.rs b/src/vmm/src/persist/v1_10/mod.rs new file mode 100644 index 00000000000..f95ce37bdca --- /dev/null +++ b/src/vmm/src/persist/v1_10/mod.rs @@ -0,0 +1,154 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serializable state types for Firecracker v1.10 (snapshot format version 4.0.0). +//! +//! Types that are identical to v1.14 are imported from that module (the canonical source). +//! Types that are the same in v1.10 and v1.12 (but different from v1.14) are imported +//! from v1.12 (the canonical source for that version pair). +//! Only types that are truly v1.10-specific are defined here. +//! +//! Key differences from v1.12: +//! - `GuestMemoryRegionState` includes an `offset` field (removed in v1.11) +//! 
- `MMIODeviceInfo` uses `irqs: Vec` (changed to `irq: Option` in v1.11) +//! - `VmState` (both arches) has `kvm_cap_modifiers` instead of `memory` +//! - `MicrovmState` has `memory_state: GuestMemoryState` at the top level (not inside VmState) +//! - x86_64 `VcpuState.xsave` is `kvm_xsave` (changed to `Xsave` in v1.12) +//! - No `KvmState` wrapper struct + +use serde::{Deserialize, Serialize}; + +#[cfg(target_arch = "x86_64")] +pub(crate) mod x86_64; +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +pub(crate) mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +// ─────────────────────────────────────────────────────────────────── +// Types identical to v1.12 — imported from that module (canonical source) +// ─────────────────────────────────────────────────────────────────── + +use crate::persist::VmInfo; + +pub use super::v1_12::{ + // ACPI device manager state (used in MicrovmState defined below) + ACPIDeviceManagerState, + BalloonState, + // Device inner states (used in Connected* wrappers defined below) + BlockState, + EntropyState, + // MMDS version (used in DeviceStates defined below) + MmdsVersionState, + // Virtio transport state (used in Connected* wrappers defined below) + MmioTransportState, + NetState, + VsockState, +}; + +// ─────────────────────────────────────────────────────────────────── +// MMIO device info (v1.10 uses `irqs: Vec`, changed to `irq: Option` in v1.11) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MMIODeviceInfo { + pub addr: u64, + pub len: u64, + pub irqs: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// Connected device state wrappers (use v1.10 MMIODeviceInfo with irqs: Vec) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBlockState { + pub 
device_id: String, + pub device_state: BlockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedNetState { + pub device_id: String, + pub device_state: NetState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedVsockState { + pub device_id: String, + pub device_state: VsockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBalloonState { + pub device_id: String, + pub device_state: BalloonState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedEntropyState { + pub device_id: String, + pub device_state: EntropyState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +// ─────────────────────────────────────────────────────────────────── +// Device states (v1.10 layout) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct DeviceStates { + #[cfg(target_arch = "aarch64")] + pub legacy_devices: Vec, + pub block_devices: Vec, + pub net_devices: Vec, + pub vsock_device: Option, + pub balloon_device: Option, + pub mmds_version: Option, + pub entropy_device: Option, +} + +// ─────────────────────────────────────────────────────────────────── +// Memory state (v1.10: GuestMemoryRegionState has `offset` field) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryRegionState { + pub base_address: u64, + pub size: usize, + /// File offset into the memory snapshot file (present in v1.10, removed in v1.11) + pub offset: u64, +} + 
+#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryState { + pub regions: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// Top-level MicrovmState (v1.10) +// Note: `memory_state` is at this level (not inside VmState), and there is no `kvm_state`. +// The kvm_cap_modifiers field lives inside VmState. +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Serialize, Deserialize)] +pub struct MicrovmState { + pub vm_info: VmInfo, + pub memory_state: GuestMemoryState, + pub vm_state: VmState, + pub vcpu_states: Vec, + pub device_states: DeviceStates, + pub acpi_dev_state: ACPIDeviceManagerState, +} diff --git a/src/vmm/src/persist/v1_10/x86_64.rs b/src/vmm/src/persist/v1_10/x86_64.rs new file mode 100644 index 00000000000..d66d1c36eec --- /dev/null +++ b/src/vmm/src/persist/v1_10/x86_64.rs @@ -0,0 +1,55 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{ + CpuId, Msrs, kvm_clock_data, kvm_debugregs, kvm_irqchip, kvm_lapic_state, kvm_mp_state, + kvm_pit_state2, kvm_regs, kvm_sregs, kvm_vcpu_events, kvm_xcrs, kvm_xsave, +}; +use serde::{Deserialize, Serialize}; + +use crate::cpu_config::templates::KvmCapability; + +// ─────────────────────────────────────────────────────────────────── +// VM state (x86_64, v1.10) +// In v1.10, VmState holds kvm_cap_modifiers; memory_state is at MicrovmState level. 
+// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub pitstate: kvm_pit_state2, + pub clock: kvm_clock_data, + pub pic_master: kvm_irqchip, + pub pic_slave: kvm_irqchip, + pub ioapic: kvm_irqchip, + pub kvm_cap_modifiers: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// vCPU state (x86_64, v1.10) +// xsave is kvm_xsave (not Xsave/FamStructWrapper) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Serialize, Deserialize)] +pub struct VcpuState { + pub cpuid: CpuId, + pub saved_msrs: Vec, + pub debug_regs: kvm_debugregs, + pub lapic: kvm_lapic_state, + pub mp_state: kvm_mp_state, + pub regs: kvm_regs, + pub sregs: kvm_sregs, + pub vcpu_events: kvm_vcpu_events, + pub xcrs: kvm_xcrs, + /// In v1.10, xsave is stored as kvm_xsave (4096-byte opaque blob). + /// In v1.12+, it became Xsave = FamStructWrapper to support Intel AMX. + pub xsave: kvm_xsave, + pub tsc_khz: Option, +} + +impl std::fmt::Debug for VcpuState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VcpuState") + .field("tsc_khz", &self.tsc_khz) + .finish_non_exhaustive() + } +} diff --git a/src/vmm/src/persist/v1_12/aarch64.rs b/src/vmm/src/persist/v1_12/aarch64.rs new file mode 100644 index 00000000000..f57079ea6ae --- /dev/null +++ b/src/vmm/src/persist/v1_12/aarch64.rs @@ -0,0 +1,63 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{kvm_mp_state, kvm_vcpu_init}; +use serde::{Deserialize, Serialize}; + +use super::{GuestMemoryState, MMIODeviceInfo}; + +// Types that are canonical in v1_14 and unchanged through all versions +pub use crate::persist::v1_14::{ + // Legacy device type enum + DeviceType, + // GIC helper types (GicState itself changed — its_state added — so redefined in v1_14) + GicRegState, + GicVcpuState, + // Register vector with custom serde + Aarch64RegisterVec, +}; + +// ─────────────────────────────────────────────────────────────────── +// aarch64 GIC types (identical to v1.10; its_state added in v1.14) +// Canonical definitions are here; v1.10 imports from this module. +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GicState { + pub dist: Vec>, + pub gic_vcpu_states: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// vCPU state (aarch64, v1.10 = v1.12) +// Canonical definition is here; v1.10 imports from this module. +// Gains `pvtime_ipa` in v1.14. 
+// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VcpuState { + pub mp_state: kvm_mp_state, + pub regs: Aarch64RegisterVec, + pub mpidr: u64, + pub kvi: kvm_vcpu_init, +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.12: memory moved into VmState; kvm_cap_modifiers → KvmState +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub memory: GuestMemoryState, + pub gic: GicState, +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 ConnectedLegacyState uses updated MMIODeviceInfo +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedLegacyState { + pub type_: DeviceType, + pub device_info: MMIODeviceInfo, +} diff --git a/src/vmm/src/persist/v1_12/mod.rs b/src/vmm/src/persist/v1_12/mod.rs new file mode 100644 index 00000000000..32cf5bd7635 --- /dev/null +++ b/src/vmm/src/persist/v1_12/mod.rs @@ -0,0 +1,436 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serializable state types for Firecracker v1.12 (snapshot format version 6.0.0). +//! +//! Types that are structurally identical to v1.14 are imported from that module. +//! Types that are the same in v1.10 and v1.12 (but different from v1.14) are defined +//! here as the canonical source; v1.10 imports them from this module. +//! Only types that are truly v1.12-specific are also defined here. +//! +//! Changes from v1.10: +//! - `MMIODeviceInfo`: `irqs: Vec` → `irq: Option` (v1.11) +//! - `GuestMemoryRegionState`: `offset` field removed (v1.11) +//! - `VmState`: memory moved here from `MicrovmState`, `kvm_cap_modifiers` moved to `KvmState` +//! 
- x86_64 `VcpuState.xsave`: `kvm_xsave` → `Xsave` (v1.12) +//! - `KvmState`: new wrapper for `kvm_cap_modifiers` +//! - `MicrovmState`: adds `kvm_state`, removes `memory_state` + +use serde::{Deserialize, Serialize}; + +use super::v1_10; +#[cfg(target_arch = "x86_64")] +use crate::arch::VcpuState; +use crate::devices::acpi::vmgenid::VMGenIDState; +use crate::devices::virtio::balloon::persist::BalloonConfigSpaceState; +use crate::devices::virtio::block::CacheType; +use crate::devices::virtio::block::virtio::persist::FileEngineTypeState; +use crate::devices::virtio::net::persist::{NetConfigSpaceState, RxBufferState}; +use crate::devices::virtio::persist::QueueState; +use crate::devices::virtio::vsock::persist::VsockBackendState; +use crate::mmds::persist::MmdsNetworkStackState; +use crate::persist::VmInfo; +use crate::rate_limiter::persist::RateLimiterState; +use crate::vstate::kvm::KvmState; + +#[cfg(target_arch = "x86_64")] +pub(crate) mod x86_64; +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +pub(crate) mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +// ─────────────────────────────────────────────────────────────────── +// Shared simple types — same in v1.10 and v1.12; differs in v1.14 +// Canonical definitions are here; v1.10 imports from this module. 
+// ───────────────────────────────────────────────────────────────────
+// NOTE(review): in this copy of the patch several generic types appear without
+// their type arguments (`Vec,` / `Option,`). They are left untouched below and
+// must be restored from the upstream file before this compiles.
+
+/// Transport-independent virtio device state in the v1.10/v1.12 layout.
+///
+/// `interrupt_status` is stored on the device state in this layout.
+#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
+pub struct VirtioDeviceState {
+    pub device_type: u32,
+    pub avail_features: u64,
+    pub acked_features: u64,
+    // presumably `Vec<QueueState>` (see the `QueueState` import) — confirm
+    pub queues: Vec,
+    pub interrupt_status: u32,
+    pub activated: bool,
+}
+
+/// MMIO transport state in the v1.10/v1.12 layout.
+#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub struct MmioTransportState {
+    pub features_select: u32,
+    pub acked_features_select: u32,
+    pub queue_select: u32,
+    pub device_status: u32,
+    pub config_generation: u32,
+}
+
+// ───────────────────────────────────────────────────────────────────
+// Block device
+// ───────────────────────────────────────────────────────────────────
+
+/// State of a virtio block device backed by a file at `disk_path`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct VirtioBlockState {
+    pub id: String,
+    pub partuuid: Option,
+    pub cache_type: CacheType,
+    pub root_device: bool,
+    pub disk_path: String,
+    pub virtio_state: VirtioDeviceState,
+    pub rate_limiter_state: RateLimiterState,
+    pub file_engine_type: FileEngineTypeState,
+}
+
+/// State of a vhost-user block device reached through `socket_path`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct VhostUserBlockState {
+    pub id: String,
+    pub partuuid: Option,
+    pub cache_type: CacheType,
+    pub root_device: bool,
+    pub socket_path: String,
+    pub vu_acked_protocol_features: u64,
+    pub config_space: Vec,
+    pub virtio_state: VirtioDeviceState,
+}
+
+/// Block device state: either the virtio or the vhost-user flavour.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum BlockState {
+    Virtio(VirtioBlockState),
+    VhostUser(VhostUserBlockState),
+}
+
+// ───────────────────────────────────────────────────────────────────
+// Net device
+// ───────────────────────────────────────────────────────────────────
+
+/// State of a virtio net device, including both rate limiters and RX buffers.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct NetState {
+    pub id: String,
+    pub tap_if_name: String,
+    pub rx_rate_limiter_state: RateLimiterState,
+    pub tx_rate_limiter_state: RateLimiterState,
+    // presumably `Option<MmdsNetworkStackState>` (see the import) — confirm
+    pub mmds_ns: Option,
+    pub config_space: NetConfigSpaceState,
+    pub virtio_state: VirtioDeviceState,
+    pub rx_buffers_state: RxBufferState,
+}
+
+// ───────────────────────────────────────────────────────────────────
+// Vsock device
+// ───────────────────────────────────────────────────────────────────
+
+/// Guest-facing half of the vsock device: context id plus virtio state.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct VsockFrontendState {
+    pub cid: u64,
+    pub virtio_state: VirtioDeviceState,
+}
+
+/// Complete vsock device state (backend + frontend).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct VsockState {
+    pub backend: VsockBackendState,
+    pub frontend: VsockFrontendState,
+}
+
+// ───────────────────────────────────────────────────────────────────
+// Balloon device
+// ───────────────────────────────────────────────────────────────────
+
+/// Last balloon statistics reported by the guest; every counter is optional.
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct BalloonStatsState {
+    pub swap_in: Option,
+    pub swap_out: Option,
+    pub major_faults: Option,
+    pub minor_faults: Option,
+    pub free_memory: Option,
+    pub total_memory: Option,
+    pub available_memory: Option,
+    pub disk_caches: Option,
+    pub hugetlb_allocations: Option,
+    pub hugetlb_failures: Option,
+}
+
+/// State of the virtio balloon device.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct BalloonState {
+    pub stats_polling_interval_s: u16,
+    pub stats_desc_index: Option,
+    pub latest_stats: BalloonStatsState,
+    pub config_space: BalloonConfigSpaceState,
+    pub virtio_state: VirtioDeviceState,
+}
+
+// ───────────────────────────────────────────────────────────────────
+// Entropy device
+// ───────────────────────────────────────────────────────────────────
+
+/// State of the virtio entropy (rng) device.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EntropyState {
+    pub virtio_state: VirtioDeviceState,
+    pub rate_limiter_state: RateLimiterState,
+}
+
+// ───────────────────────────────────────────────────────────────────
+// MMDS
+// ───────────────────────────────────────────────────────────────────
+
+/// MMDS version (renamed to `MmdsVersion` and restructured in v1.14).
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum MmdsVersionState { + V1, + V2, +} + +// ─────────────────────────────────────────────────────────────────── +// ACPI devices state (same as v1.10; vmgenid becomes mandatory in v1.14) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct ACPIDeviceManagerState { + pub vmgenid: Option, +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.11: irqs: Vec → irq: Option +// ─────────────────────────────────────────────────────────────────── + +/// MMIO device info. +/// +/// Note: stored as `Option` in Firecracker source, but `NonZeroU32` has +/// the same bincode wire format as `u32`, so we use `Option` here. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MMIODeviceInfo { + pub addr: u64, + pub len: u64, + pub irq: Option, +} + +impl MMIODeviceInfo { + pub(crate) fn from(old: v1_10::MMIODeviceInfo) -> MMIODeviceInfo { + MMIODeviceInfo { + addr: old.addr, + len: old.len, + // v1.10 stored a Vec of IRQs; v1.11+ uses a single optional IRQ. + // In practice exactly one IRQ was always present for devices that have one. + irq: old.irqs.into_iter().next(), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.11: `offset` field removed from GuestMemoryRegionState +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryRegionState { + pub base_address: u64, + pub size: usize, +} + +impl From for GuestMemoryRegionState { + fn from(old: v1_10::GuestMemoryRegionState) -> Self { + // Drop the `offset` field which was removed in v1.11. 
+ GuestMemoryRegionState { + base_address: old.base_address, + size: old.size, + } + } +} + +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryState { + pub regions: Vec, +} + +impl From for GuestMemoryState { + fn from(old: v1_10::GuestMemoryState) -> Self { + GuestMemoryState { + regions: old + .regions + .into_iter() + .map(GuestMemoryRegionState::from) + .collect(), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Connected device state wrappers — redefined because MMIODeviceInfo changed. +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBlockState { + pub device_id: String, + pub device_state: BlockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedNetState { + pub device_id: String, + pub device_state: NetState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedVsockState { + pub device_id: String, + pub device_state: VsockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBalloonState { + pub device_id: String, + pub device_state: BalloonState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedEntropyState { + pub device_id: String, + pub device_state: EntropyState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct DeviceStates { + #[cfg(target_arch = "aarch64")] + pub legacy_devices: Vec, + pub block_devices: Vec, + pub net_devices: Vec, + pub vsock_device: Option, 
+ pub balloon_device: Option, + pub mmds_version: Option, + pub entropy_device: Option, +} + +impl From for DeviceStates { + fn from(old: v1_10::DeviceStates) -> Self { + DeviceStates { + #[cfg(target_arch = "aarch64")] + legacy_devices: old + .legacy_devices + .into_iter() + .map(|ld| ConnectedLegacyState { + type_: ld.type_, + device_info: MMIODeviceInfo::from(ld.device_info), + }) + .collect(), + block_devices: old + .block_devices + .into_iter() + .map(|d| ConnectedBlockState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }) + .collect(), + net_devices: old + .net_devices + .into_iter() + .map(|d| ConnectedNetState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }) + .collect(), + vsock_device: old.vsock_device.map(|d| ConnectedVsockState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }), + balloon_device: old.balloon_device.map(|d| ConnectedBalloonState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }), + mmds_version: old.mmds_version, + entropy_device: old.entropy_device.map(|d| ConnectedEntropyState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Top-level MicrovmState (v1.12) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Serialize, Deserialize)] +pub struct MicrovmState { + /// Imported from v1_14; unchanged through all versions. 
+ pub vm_info: VmInfo, + /// Imported from v1_14; wraps `kvm_cap_modifiers`, extracted from v1.10's `VmState`. + pub kvm_state: KvmState, + /// Redefined in v1.12: `memory` moved in from top-level `MicrovmState.memory_state`, + /// `kvm_cap_modifiers` moved out to `KvmState`. Redefined again in v1.14: gains + /// `resource_allocator`; `GuestMemoryRegionState` gains `region_type` and `plugged`. + pub vm_state: VmState, + /// x86_64: redefined here (`xsave` type changed from `kvm_xsave` to `Xsave`); + /// imported into v1.14 (same type). + /// aarch64: canonical definition here (same as v1.10; gains `pvtime_ipa` in v1.14). + pub vcpu_states: Vec, + /// Redefined here: all `ConnectedXxxState` wrappers rebuilt because `MMIODeviceInfo` + /// changed (`irqs: Vec` → `irq: Option`). Inner device states (BlockState, + /// NetState, etc.) are defined in this module as the v1.10/v1.12 canonical source. + pub device_states: DeviceStates, + /// Defined in this module as the v1.10/v1.12 canonical source. Redefined in v1.14: + /// `vmgenid` becomes mandatory, x86_64 gains `vmclock`; moved inside + /// `DevicesState.acpi_state` (no longer top-level). + pub acpi_dev_state: ACPIDeviceManagerState, +} + +impl From for MicrovmState { + fn from(old: v1_10::MicrovmState) -> Self { + // In v1.10, kvm_cap_modifiers lives in VmState; in v1.12 it moves to KvmState. + // KvmCapability is the same type in all versions (imported from v1_14). + let kvm_cap_modifiers = old.vm_state.kvm_cap_modifiers; + + let memory = GuestMemoryState::from(old.memory_state); + + #[cfg(target_arch = "x86_64")] + let vm_state = VmState { + memory, + pitstate: old.vm_state.pitstate, + clock: old.vm_state.clock, + pic_master: old.vm_state.pic_master, + pic_slave: old.vm_state.pic_slave, + ioapic: old.vm_state.ioapic, + }; + + #[cfg(target_arch = "aarch64")] + let vm_state = VmState { + memory, + gic: old.vm_state.gic, + }; + + // x86_64: xsave type changed from kvm_xsave → Xsave, needs conversion. 
+ // aarch64: VcpuState is identical in v1.10 and v1.12 (v1_12 is canonical source). + #[cfg(target_arch = "x86_64")] + let vcpu_states: Vec = + old.vcpu_states.into_iter().map(VcpuState::from).collect(); + #[cfg(target_arch = "aarch64")] + let vcpu_states = old.vcpu_states; + + MicrovmState { + vm_info: old.vm_info, + kvm_state: KvmState { kvm_cap_modifiers }, + vm_state, + vcpu_states, + device_states: DeviceStates::from(old.device_states), + acpi_dev_state: old.acpi_dev_state, + } + } +} diff --git a/src/vmm/src/persist/v1_12/x86_64.rs b/src/vmm/src/persist/v1_12/x86_64.rs new file mode 100644 index 00000000000..912bb10b7ab --- /dev/null +++ b/src/vmm/src/persist/v1_12/x86_64.rs @@ -0,0 +1,46 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{kvm_clock_data, kvm_irqchip, kvm_pit_state2}; +use serde::{Deserialize, Serialize}; + +use crate::{arch::VcpuState, persist::v1_14::x86_64::xsave_from_v1_10}; + +use super::{GuestMemoryState, v1_10}; + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.12: memory moved into VmState; kvm_cap_modifiers → KvmState +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub memory: GuestMemoryState, + pub pitstate: kvm_pit_state2, + pub clock: kvm_clock_data, + pub pic_master: kvm_irqchip, + pub pic_slave: kvm_irqchip, + pub ioapic: kvm_irqchip, +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.12: xsave type changed from kvm_xsave → Xsave +// VcpuState is defined in v1_14 (same in v1.12 and v1.14); conversion from v1.10 is here. 
+// ───────────────────────────────────────────────────────────────────
+
+impl VcpuState {
+    /// Builds the v1.12 x86_64 vCPU state out of a v1.10 one.
+    ///
+    /// Every field is moved across as-is except `xsave`, which is passed
+    /// through `xsave_from_v1_10`.
+    pub(crate) fn from(old: v1_10::VcpuState) -> VcpuState {
+        let v1_10::VcpuState {
+            cpuid,
+            saved_msrs,
+            debug_regs,
+            lapic,
+            mp_state,
+            regs,
+            sregs,
+            vcpu_events,
+            xcrs,
+            xsave,
+            tsc_khz,
+            ..
+        } = old;
+
+        VcpuState {
+            cpuid,
+            saved_msrs,
+            debug_regs,
+            lapic,
+            mp_state,
+            regs,
+            sregs,
+            vcpu_events,
+            xcrs,
+            xsave: xsave_from_v1_10(xsave),
+            tsc_khz,
+        }
+    }
+}
diff --git a/src/vmm/src/persist/v1_14/aarch64.rs b/src/vmm/src/persist/v1_14/aarch64.rs
new file mode 100644
index 00000000000..8ac93332eb9
--- /dev/null
+++ b/src/vmm/src/persist/v1_14/aarch64.rs
@@ -0,0 +1,134 @@
+// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use serde::{Deserialize, Serialize};
+
+use super::{ACPIDeviceManagerState, ConvertError, GuestMemoryState, MMIODeviceInfo,
+    ResourceAllocator, irq_to_gsi};
+use crate::devices::acpi::vmgenid::VMGenIDState;
+use crate::persist::v1_12;
+
+// ───────────────────────────────────────────────────────────────────
+// Re-export runtime types — v1.14 snapshot format matches the runtime format.
+// These are used by v1.12 (and v1.10 via v1.12) as canonical type definitions.
+// ───────────────────────────────────────────────────────────────────
+
+pub use crate::arch::aarch64::gic::{GicRegState, GicState, GicVcpuState};
+pub use crate::arch::aarch64::regs::Aarch64RegisterVec;
+pub use crate::arch::aarch64::vcpu::VcpuState;
+pub use crate::arch::aarch64::vm::VmState;
+
+// ───────────────────────────────────────────────────────────────────
+// StaticCpuTemplate — aarch64-specific snapshot enum (same in v1.10, v1.12, v1.14)
+// ───────────────────────────────────────────────────────────────────
+
+/// Static CPU template recorded in aarch64 snapshots; defaults to `None`.
+#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub enum StaticCpuTemplate {
+    V1N1,
+    #[default]
+    None,
+}
+
+// ───────────────────────────────────────────────────────────────────
+// DeviceType — aarch64 legacy device type enum (snapshot format)
+// ───────────────────────────────────────────────────────────────────
+
+/// Kind of an MMIO device as recorded in the snapshot.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum DeviceType {
+    Virtio(u32),
+    Serial,
+    Rtc,
+}
+
+// The `From` type argument is the snapshot-format `DeviceType` defined above;
+// each variant maps onto the runtime variant of the same name.
+impl From<DeviceType> for crate::arch::DeviceType {
+    fn from(dt: DeviceType) -> Self {
+        match dt {
+            DeviceType::Virtio(n) => crate::arch::DeviceType::Virtio(n),
+            DeviceType::Serial => crate::arch::DeviceType::Serial,
+            DeviceType::Rtc => crate::arch::DeviceType::Rtc,
+        }
+    }
+}
+
+// ───────────────────────────────────────────────────────────────────
+// ConnectedLegacyState — convert v1.12 snapshot type to runtime type
+// ───────────────────────────────────────────────────────────────────
+
+impl From<v1_12::ConnectedLegacyState> for crate::device_manager::persist::ConnectedLegacyState {
+    fn from(s: v1_12::ConnectedLegacyState) -> Self {
+        crate::device_manager::persist::ConnectedLegacyState {
+            type_: crate::arch::DeviceType::from(s.type_),
+            device_info: MMIODeviceInfo::from(s.device_info),
+        }
+    }
+}
+
+// ───────────────────────────────────────────────────────────────────
+// GIC state (aarch64, v1.14: adds its_state)
+// GicState is the runtime type (re-exported above); conversion from v1.12 is here.
+// ───────────────────────────────────────────────────────────────────
+
+impl GicState {
+    /// Converts a v1.12 GIC state; `its_state` did not exist then and is `None`.
+    pub(crate) fn from(old_state: v1_12::GicState) -> GicState {
+        GicState {
+            dist: old_state.dist,
+            gic_vcpu_states: old_state.gic_vcpu_states,
+            its_state: None, // v1.12 had no ITS support
+        }
+    }
+}
+
+// ───────────────────────────────────────────────────────────────────
+// vCPU state (aarch64, v1.14: gains pvtime_ipa)
+// VcpuState is the runtime type (re-exported above); conversion from v1.12 is here.
+// ───────────────────────────────────────────────────────────────────
+
+impl VcpuState {
+    /// Converts a v1.12 vCPU state; the new `pvtime_ipa` field is set to `None`.
+    pub(crate) fn from(old_state: v1_12::VcpuState) -> VcpuState {
+        VcpuState {
+            mp_state: old_state.mp_state,
+            regs: old_state.regs,
+            mpidr: old_state.mpidr,
+            kvi: old_state.kvi,
+            pvtime_ipa: None, // new in v1.14; default to None (not configured)
+        }
+    }
+}
+
+// ───────────────────────────────────────────────────────────────────
+// ACPI device state (aarch64: no vmclock)
+// ───────────────────────────────────────────────────────────────────
+
+impl ACPIDeviceManagerState {
+    /// Converts the v1.12 ACPI state, where `vmgenid` was optional, to v1.14.
+    ///
+    /// Returns `ConvertError::MissingVmGenId` when the snapshot has no VMGenID.
+    /// `_resource_allocator` is not used by this aarch64 conversion.
+    pub(crate) fn from(
+        s: v1_12::ACPIDeviceManagerState,
+        _resource_allocator: &mut ResourceAllocator,
+    ) -> Result<ACPIDeviceManagerState, ConvertError> {
+        let vmgenid = s.vmgenid.ok_or(ConvertError::MissingVmGenId)?;
+        Ok(ACPIDeviceManagerState {
+            vmgenid: VMGenIDState {
+                // v1.12 aarch64 uses IRQ_BASE=32-based numbers; v1.14 uses 0-based GSIs
+                gsi: irq_to_gsi(vmgenid.gsi),
+                addr: vmgenid.addr,
+            },
+        })
+    }
+}
+
+// ───────────────────────────────────────────────────────────────────
+// VM state (aarch64, v1.14: adds resource_allocator)
+// VmState is the runtime type (re-exported above); conversion from v1.12 is here.
+// ───────────────────────────────────────────────────────────────────
+
+impl VmState {
+    /// Assembles the v1.14 aarch64 VM state from the v1.12 one and the given
+    /// `resource_allocator`; memory and GIC state are converted field by field.
+    pub(crate) fn from(
+        old_state: v1_12::VmState,
+        resource_allocator: ResourceAllocator,
+    ) -> VmState {
+        let v1_12::VmState {
+            memory: old_memory,
+            gic: old_gic,
+        } = old_state;
+
+        let memory = GuestMemoryState::from(old_memory);
+        let gic = GicState::from(old_gic);
+
+        VmState {
+            memory,
+            gic,
+            resource_allocator,
+        }
+    }
+}
diff --git a/src/vmm/src/persist/v1_14/mod.rs b/src/vmm/src/persist/v1_14/mod.rs
new file mode 100644
index 00000000000..eb780bbe6f5
--- /dev/null
+++ b/src/vmm/src/persist/v1_14/mod.rs
@@ -0,0 +1,597 @@
+// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Serializable state types for Firecracker v1.14 (snapshot format version 8.0.0).
+//!
+//! This module is the **canonical source** for types shared across all snapshot versions.
+//! Older modules (v1_12, v1_10) import unchanged types from here rather than defining
+//! their own copies.
+//!
+//! Types that are unique to v1.14 or changed from v1.12:
+//! - `VirtioDeviceState`: `interrupt_status` removed (moved to `MmioTransportState`)
+//! - `MmioTransportState`: gains `interrupt_status`
+//! - `MMIODeviceInfo`: `irq` → `gsi`
+//! - `NetState`: `rx_buffers_state` retained
+//! - `BalloonStatsState`: 6 new fields
+//! - `BalloonState`: gains `hinting_state`
+//! - aarch64 `GicState`: gains `its_state`
+//! - aarch64 `VcpuState`: gains `pvtime_ipa`
+//! - `GuestMemoryRegionState`: gains `region_type` and `plugged`
+//! - `ACPIDeviceManagerState`: vmgenid now mandatory, adds vmclock (x86_64)
+//! - New types: `ConnectedDeviceState`, `DevicesState`, `ResourceAllocator`,
+//!   `PmemState`, `VirtioMemState`, `MmdsState`, `GuestRegionType`, etc.
+
+use vm_allocator::{AddressAllocator, AllocPolicy, IdAllocator};
+
+#[cfg(target_arch = "x86_64")]
+pub(crate) mod x86_64;
+
+#[cfg(target_arch = "aarch64")]
+pub(crate) mod aarch64;
+#[cfg(target_arch = "aarch64")]
+pub use aarch64::*;
+
+use crate::arch::{
+    FIRST_ADDR_PAST_64BITS_MMIO, GSI_LEGACY_END, GSI_LEGACY_START, GSI_MSI_END, GSI_MSI_START,
+    MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, MEM_64BIT_DEVICES_SIZE,
+    MEM_64BIT_DEVICES_START, PAST_64BITS_MMIO_SIZE, SYSTEM_MEM_SIZE, SYSTEM_MEM_START,
+};
+#[cfg(target_arch = "x86_64")]
+use crate::arch::VmState;
+use crate::device_manager::DevicesState;
+use crate::device_manager::mmio::MMIODeviceInfo;
+use crate::device_manager::pci_mngr::PciDevicesState;
+use crate::device_manager::persist::{
+    ACPIDeviceManagerState, DeviceStates, MmdsState, VirtioDeviceState as ConnectedDeviceState,
+};
+#[cfg(target_arch = "aarch64")]
+use crate::device_manager::persist::ConnectedLegacyState;
+use crate::devices::acpi::vmgenid::VMGENID_MEM_SIZE;
+use crate::devices::virtio::balloon::device::HintingState;
+use crate::devices::virtio::balloon::persist::{BalloonState, BalloonStatsState};
+use crate::devices::virtio::block::persist::BlockState;
+use crate::devices::virtio::block::vhost_user::persist::VhostUserBlockState;
+use crate::devices::virtio::block::virtio::persist::VirtioBlockState;
+use crate::devices::virtio::net::persist::NetState;
+use crate::devices::virtio::persist::{MmioTransportState, VirtioDeviceState};
+use crate::devices::virtio::rng::persist::EntropyState;
+use crate::devices::virtio::vsock::persist::{VsockFrontendState, VsockState};
+use crate::mmds::data_store::MmdsVersion;
+use crate::persist::{MicrovmState, v1_12};
+use crate::vstate::memory::{GuestMemoryRegionState, GuestMemoryState, GuestRegionType};
+use crate::vstate::resources::ResourceAllocator;
+
+/// Errors that can occur while converting a v1.12 snapshot state to v1.14.
+#[derive(Debug, thiserror::Error)]
+pub enum ConvertError {
+    #[error("VMGenID state is missing; cannot convert snapshot (v1.12 snapshot must have VMGenID)")]
+    MissingVmGenId,
+    #[error("vm-allocator error during ResourceAllocator reconstruction: {0}")]
+    Allocator(#[from] vm_allocator::Error),
+    #[error("ResourceAllocator reconstruction failed: duplicate/invalid MMIO address 0x{0:x}")]
+    DuplicateAddress(u64),
+    #[error("ResourceAllocator reconstruction failed: GSI {0} out of expected range")]
+    #[allow(dead_code)]
+    GsiOutOfRange(u32),
+}
+
+/// Offset subtracted from a v1.12 IRQ number to obtain a v1.14 GSI number.
+///
+/// In v1.12 x86_64, IRQ_BASE = 5 = GSI_LEGACY_START, so no conversion is needed
+/// and the offset is a no-op.
+#[cfg(target_arch = "x86_64")]
+pub const SPI_START: u32 = 0;
+
+/// Offset subtracted from a v1.12 IRQ number to obtain a v1.14 GSI number.
+///
+/// In v1.12 aarch64, IRQ numbers start at IRQ_BASE = 32 while v1.14 GSIs start
+/// at GSI_LEGACY_START = 0, so 32 has to be subtracted. Leaving this at 0 would
+/// turn `irq_to_gsi` into a no-op on aarch64 and keep the 32-based numbers.
+#[cfg(target_arch = "aarch64")]
+pub const SPI_START: u32 = 32;
+
+/// Convert a v1.12 IRQ number to a v1.14 GSI number.
+///
+/// x86_64: IRQ_BASE (5) == GSI_LEGACY_START (5) — no transformation needed.
+/// aarch64: IRQ_BASE (32) != GSI_LEGACY_START (0) — subtract SPI_START (32).
+///
+/// The subtraction saturates, so an IRQ below `SPI_START` maps to 0.
+pub(crate) fn irq_to_gsi(irq: u32) -> u32 {
+    irq.saturating_sub(SPI_START)
+}
+
+impl VirtioDeviceState {
+    /// Convert v1.12 VirtioDeviceState → v1.14 VirtioDeviceState.
+    ///
+    /// With v1.14, the `interrupt_status` moves from [`VirtioDeviceState`] to [`MmioTransportState`].
+    /// That's why we don't use `From<v1_12::VirtioDeviceState>` here, so we can return
+    /// `interrupt_status` separately.
+    pub(crate) fn from(old_state: v1_12::VirtioDeviceState) -> (Self, u32) {
+        let interrupt_status = old_state.interrupt_status;
+        let new_state = VirtioDeviceState {
+            device_type: old_state.device_type,
+            avail_features: old_state.avail_features,
+            acked_features: old_state.acked_features,
+            queues: old_state.queues, // QueueState is the same type (re-exported v1_10 → v1_12 → v1_14)
+            activated: old_state.activated,
+        };
+        (new_state, interrupt_status)
+    }
+}
+
+/// Convert v1.12 MmioTransportState → v1.14 MmioTransportState with interrupt_status.
+impl MmioTransportState { + pub(crate) fn from(old_state: v1_12::MmioTransportState, interrupt_status: u32) -> Self { + MmioTransportState { + features_select: old_state.features_select, + acked_features_select: old_state.acked_features_select, + queue_select: old_state.queue_select, + device_status: old_state.device_status, + config_generation: old_state.config_generation, + interrupt_status, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.14: irq → gsi +// ─────────────────────────────────────────────────────────────────── +impl MMIODeviceInfo { + /// Convert v1.12 MMIODeviceInfo → v1.14 MMIODeviceInfo. + /// irq (Option, same wire format as Option) → gsi: Option + pub(crate) fn from(old_state: v1_12::MMIODeviceInfo) -> MMIODeviceInfo { + MMIODeviceInfo { + addr: old_state.addr, + len: old_state.len, + gsi: old_state.irq.map(irq_to_gsi), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Block device — redefined because VirtioDeviceState changed +// ─────────────────────────────────────────────────────────────────── +impl VirtioBlockState { + pub(crate) fn from(old_state: v1_12::VirtioBlockState) -> (VirtioBlockState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = VirtioBlockState { + id: old_state.id, + partuuid: old_state.partuuid, + cache_type: old_state.cache_type, + root_device: old_state.root_device, + disk_path: old_state.disk_path, + virtio_state, + rate_limiter_state: old_state.rate_limiter_state, + file_engine_type: old_state.file_engine_type, + }; + (new, interrupt_status) + } +} + +impl VhostUserBlockState { + pub(crate) fn from(old_state: v1_12::VhostUserBlockState) -> (VhostUserBlockState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = VhostUserBlockState { + id: old_state.id, + partuuid: old_state.partuuid, + cache_type: 
old_state.cache_type, + root_device: old_state.root_device, + socket_path: old_state.socket_path, + vu_acked_protocol_features: old_state.vu_acked_protocol_features, + config_space: old_state.config_space, + virtio_state, + }; + (new, interrupt_status) + } +} + +impl BlockState { + pub(crate) fn from(old_state: v1_12::BlockState) -> (BlockState, u32) { + match old_state { + v1_12::BlockState::Virtio(b) => { + let (new, irq) = VirtioBlockState::from(b); + (BlockState::Virtio(new), irq) + } + v1_12::BlockState::VhostUser(b) => { + let (new, irq) = VhostUserBlockState::from(b); + (BlockState::VhostUser(new), irq) + } + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// MMDS — MmdsVersionState renamed/restructured to MmdsState +// ─────────────────────────────────────────────────────────────────── +impl MmdsVersion { + pub(crate) fn from(old_state: v1_12::MmdsVersionState) -> MmdsVersion { + match old_state { + v1_12::MmdsVersionState::V1 => MmdsVersion::V1, + v1_12::MmdsVersionState::V2 => MmdsVersion::V2, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Net device — changed: VirtioDeviceState changed; rx_buffers_state retained +// ─────────────────────────────────────────────────────────────────── +impl NetState { + pub(crate) fn from(old_state: v1_12::NetState) -> (NetState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = NetState { + id: old_state.id, + tap_if_name: old_state.tap_if_name, + rx_rate_limiter_state: old_state.rx_rate_limiter_state, + tx_rate_limiter_state: old_state.tx_rate_limiter_state, + mmds_ns: old_state.mmds_ns, + config_space: old_state.config_space, + virtio_state, + rx_buffers_state: old_state.rx_buffers_state, + }; + (new, interrupt_status) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Vsock device — VsockFrontendState/VsockState redefined (VirtioDeviceState changed) 
+// VsockUdsState and VsockBackendState are unchanged and defined above +// ─────────────────────────────────────────────────────────────────── +impl VsockState { + pub(crate) fn from(old_state: v1_12::VsockState) -> (VsockState, u32) { + let (virtio_state, interrupt_status) = + VirtioDeviceState::from(old_state.frontend.virtio_state); + let new = VsockState { + backend: old_state.backend, + frontend: VsockFrontendState { + cid: old_state.frontend.cid, + virtio_state, + }, + }; + (new, interrupt_status) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Balloon device — BalloonStatsState gains 6 new fields; BalloonState gains hinting_state +// ─────────────────────────────────────────────────────────────────── +impl BalloonStatsState { + pub(crate) fn from(old_state: v1_12::BalloonStatsState) -> BalloonStatsState { + BalloonStatsState { + swap_in: old_state.swap_in, + swap_out: old_state.swap_out, + major_faults: old_state.major_faults, + minor_faults: old_state.minor_faults, + free_memory: old_state.free_memory, + total_memory: old_state.total_memory, + available_memory: old_state.available_memory, + disk_caches: old_state.disk_caches, + hugetlb_allocations: old_state.hugetlb_allocations, + hugetlb_failures: old_state.hugetlb_failures, + oom_kill: None, + alloc_stall: None, + async_scan: None, + direct_scan: None, + async_reclaim: None, + direct_reclaim: None, + } + } +} + +impl BalloonState { + pub(crate) fn from(old_state: v1_12::BalloonState) -> (BalloonState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = BalloonState { + stats_polling_interval_s: old_state.stats_polling_interval_s, + stats_desc_index: old_state.stats_desc_index, + latest_stats: BalloonStatsState::from(old_state.latest_stats), + config_space: old_state.config_space, + hinting_state: HintingState { + host_cmd: 0, + last_cmd_id: 0, + guest_cmd: None, + // Default: acknowledge on finish (matches 
firecracker's `default_ack_on_stop()`) + acknowledge_on_finish: true, + }, + virtio_state, + }; + (new, interrupt_status) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Entropy device — redefined because VirtioDeviceState changed +// ─────────────────────────────────────────────────────────────────── +impl EntropyState { + pub(crate) fn from(old_state: v1_12::EntropyState) -> (EntropyState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = EntropyState { + virtio_state, + rate_limiter_state: old_state.rate_limiter_state, + }; + (new, interrupt_status) + } +} + +macro_rules! convert_connected_state { + ($old_type:ty, $new_type:ty) => { + impl From<$old_type> for ConnectedDeviceState<$new_type> { + fn from(old_type: $old_type) -> Self { + let (device_state, interrupt_status) = <$new_type>::from(old_type.device_state); + let transport_state = + MmioTransportState::from(old_type.transport_state, interrupt_status); + ConnectedDeviceState { + device_id: old_type.device_id, + device_state, + transport_state, + device_info: MMIODeviceInfo::from(old_type.device_info), + } + } + } + }; +} + +convert_connected_state!(v1_12::ConnectedBlockState, BlockState); +convert_connected_state!(v1_12::ConnectedNetState, NetState); +convert_connected_state!(v1_12::ConnectedVsockState, VsockState); +convert_connected_state!(v1_12::ConnectedBalloonState, BalloonState); +convert_connected_state!(v1_12::ConnectedEntropyState, EntropyState); + +// ─────────────────────────────────────────────────────────────────── +// Device states (v1.14 layout) +// ─────────────────────────────────────────────────────────────────── + +impl From for DeviceStates { + fn from(old_state: v1_12::DeviceStates) -> Self { + DeviceStates { + #[cfg(target_arch = "aarch64")] + legacy_devices: old_state + .legacy_devices + .into_iter() + .map(ConnectedLegacyState::from) + .collect(), + block_devices: old_state + 
.block_devices + .into_iter() + .map(ConnectedDeviceState::::from) + .collect(), + net_devices: old_state + .net_devices + .into_iter() + .map(ConnectedDeviceState::::from) + .collect(), + vsock_device: old_state + .vsock_device + .map(ConnectedDeviceState::::from), + balloon_device: old_state + .balloon_device + .map(ConnectedDeviceState::::from), + mmds: old_state.mmds_version.map(|v| MmdsState { + version: MmdsVersion::from(v), + imds_compat: false, + }), + entropy_device: old_state + .entropy_device + .map(ConnectedDeviceState::::from), + // pmem and memory devices are new in v1.14, not present in v1.12 + pmem_devices: Vec::new(), + memory_device: None, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Memory state (v1.14: region_type and plugged added) +// ─────────────────────────────────────────────────────────────────── +impl From for GuestMemoryState { + fn from(old_state: v1_12::GuestMemoryState) -> Self { + GuestMemoryState { + regions: old_state + .regions + .into_iter() + .map(|r| GuestMemoryRegionState { + base_address: r.base_address, + size: r.size, + // v1.12 snapshots don't have memory hotplug, all regions are Dram + region_type: GuestRegionType::Dram, + // No slots were plugged/unplugged; Dram regions have a single slot + // of size == region size, so there's 1 plugged slot + plugged: vec![true], + }) + .collect(), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// ResourceAllocator (new in v1.14) +// ─────────────────────────────────────────────────────────────────── +impl ResourceAllocator { + /// Reconstruct the v1.14 ResourceAllocator from v1.12 device information. + /// + /// In v1.12, the ResourceAllocator state wasn't persisted; in v1.14 it is. + /// We reconstruct it by marking all allocations that were made during VM setup. 
+ pub(crate) fn from( + device_states: &v1_12::DeviceStates, + acpi_state: &v1_12::ACPIDeviceManagerState, + ) -> Result { + // Initialize fresh allocators matching ResourceAllocator::new() + let mut gsi_legacy = + IdAllocator::new(GSI_LEGACY_START, GSI_LEGACY_END).map_err(ConvertError::Allocator)?; + let mut gsi_msi = + IdAllocator::new(GSI_MSI_START, GSI_MSI_END).map_err(ConvertError::Allocator)?; + let mut mmio32 = AddressAllocator::new(MEM_32BIT_DEVICES_START, MEM_32BIT_DEVICES_SIZE) + .map_err(ConvertError::Allocator)?; + + // 64-bit MMIO space + let mmio64_start = MEM_64BIT_DEVICES_START; + let mmio64_size = MEM_64BIT_DEVICES_SIZE; + let mmio64 = + AddressAllocator::new(mmio64_start, mmio64_size).map_err(ConvertError::Allocator)?; + + // Past 64-bit MMIO space + let past_mmio64_start = FIRST_ADDR_PAST_64BITS_MMIO; + let past_mmio64_size = PAST_64BITS_MMIO_SIZE; + let past_mmio64 = AddressAllocator::new(past_mmio64_start, past_mmio64_size) + .map_err(ConvertError::Allocator)?; + + // System memory allocator + let mut system_mem = AddressAllocator::new(SYSTEM_MEM_START, SYSTEM_MEM_SIZE) + .map_err(ConvertError::Allocator)?; + + // Collect all used GSIs and MMIO addresses from devices + let mut used_legacy_gsis: Vec = Vec::new(); + let mut used_msi_gsis: Vec = Vec::new(); + let mut used_mmio32_addrs: Vec<(u64, u64)> = Vec::new(); // (addr, len) + + // Helper to classify and record a device's MMIODeviceInfo. + // On aarch64, v1.12 stores IRQ numbers starting from IRQ_BASE=32 (physical SPI), + // while v1.14 uses 0-based GSI numbers. We convert with irq_to_gsi(). + // Also: only record MMIO addresses within the v1.14 mmio32_memory range + // [MEM_32BIT_DEVICES_START, ...). Addresses below that (serial, RTC, early virtio + // devices allocated from v1.12's single MMIO allocator) are not tracked by the + // v1.14 mmio32_memory allocator and must be skipped. 
+ let mut record_device_info = |info: &v1_12::MMIODeviceInfo| { + if let Some(irq) = info.irq { + let gsi = irq_to_gsi(irq); + if (GSI_LEGACY_START..=GSI_LEGACY_END).contains(&gsi) { + used_legacy_gsis.push(gsi); + } else if (GSI_MSI_START..=GSI_MSI_END).contains(&gsi) { + used_msi_gsis.push(gsi); + } + } + // Only record addresses within the v1.14 mmio32_memory range + if info.addr >= MEM_32BIT_DEVICES_START { + used_mmio32_addrs.push((info.addr, info.len)); + } + }; + + for dev in &device_states.block_devices { + record_device_info(&dev.device_info); + } + for dev in &device_states.net_devices { + record_device_info(&dev.device_info); + } + if let Some(dev) = &device_states.vsock_device { + record_device_info(&dev.device_info); + } + if let Some(dev) = &device_states.balloon_device { + record_device_info(&dev.device_info); + } + if let Some(dev) = &device_states.entropy_device { + record_device_info(&dev.device_info); + } + + #[cfg(target_arch = "aarch64")] + for dev in &device_states.legacy_devices { + record_device_info(&dev.device_info); + } + + // Also account for VMGenID's legacy GSI. + // v1.12 stores IRQ_BASE-based values; convert to v1.14 0-based GSI. + if let Some(vmgenid) = &acpi_state.vmgenid { + let gsi = irq_to_gsi(vmgenid.gsi); + if (GSI_LEGACY_START..=GSI_LEGACY_END).contains(&gsi) { + used_legacy_gsis.push(gsi); + } + } + + // Reconstruct legacy GSI allocator + // IdAllocator allocates sequentially. To reconstruct it, we allocate IDs up to + // max(used_ids) and free the ones we didn't use. 
+ if !used_legacy_gsis.is_empty() { + let max_gsi = *used_legacy_gsis.iter().max().unwrap(); + let used_set: std::collections::HashSet = + used_legacy_gsis.iter().cloned().collect(); + + // Allocate all IDs from start to max + let mut allocated = Vec::new(); + for id in GSI_LEGACY_START..=max_gsi { + let got = gsi_legacy.allocate_id().map_err(ConvertError::Allocator)?; + allocated.push(got); + assert_eq!(got, id, "IdAllocator must allocate sequentially"); + } + // Free the ones not in use + for id in GSI_LEGACY_START..=max_gsi { + if !used_set.contains(&id) { + gsi_legacy.free_id(id).map_err(ConvertError::Allocator)?; + } + } + } + + // Reconstruct MSI GSI allocator (similarly) + if !used_msi_gsis.is_empty() { + let max_gsi = *used_msi_gsis.iter().max().unwrap(); + let used_set: std::collections::HashSet = used_msi_gsis.iter().cloned().collect(); + + for id in GSI_MSI_START..=max_gsi { + let got = gsi_msi.allocate_id().map_err(ConvertError::Allocator)?; + assert_eq!(got, id); + } + for id in GSI_MSI_START..=max_gsi { + if !used_set.contains(&id) { + gsi_msi.free_id(id).map_err(ConvertError::Allocator)?; + } + } + } + + // Reconstruct 32-bit MMIO allocator + // Each MMIO device was allocated with FirstMatch policy, so they were assigned + // sequentially. We use ExactMatch to mark each address as used. + for (addr, len) in &used_mmio32_addrs { + mmio32 + .allocate(*len, 1, AllocPolicy::ExactMatch(*addr)) + .map_err(|_| ConvertError::DuplicateAddress(*addr))?; + } + + // Reconstruct system memory allocator. + // In v1.12, VMGenID was allocated with LastMatch (highest addr in system_memory). + // VmClock (x86_64 only, new in v1.14) will be allocated in ACPIDeviceManagerState::from + // using LastMatch, which will place it just below the VMGenID region. + // We mark the VMGenID address as used here so the VmClock allocation in + // ACPIDeviceManagerState::from gets the correct (lower) address. 
+ if let Some(vmgenid) = &acpi_state.vmgenid { + system_mem + .allocate(VMGENID_MEM_SIZE, 8, AllocPolicy::ExactMatch(vmgenid.addr)) + .map_err(|_| ConvertError::DuplicateAddress(vmgenid.addr))?; + } + + Ok(ResourceAllocator { + gsi_legacy_allocator: gsi_legacy, + gsi_msi_allocator: gsi_msi, + mmio32_memory: mmio32, + mmio64_memory: mmio64, + past_mmio64_memory: past_mmio64, + system_memory: system_mem, + }) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Top-level MicrovmState (v1.14) +// ─────────────────────────────────────────────────────────────────── +impl TryFrom for MicrovmState { + type Error = ConvertError; + + fn try_from(old: v1_12::MicrovmState) -> Result { + // Reconstruct ResourceAllocator from device info + let mut resource_allocator = + ResourceAllocator::from(&old.device_states, &old.acpi_dev_state)?; + + // Convert ACPI state (also allocates VmClock from resource_allocator on x86_64) + let acpi_state = ACPIDeviceManagerState::from(old.acpi_dev_state, &mut resource_allocator)?; + + // Convert device states + let mmio_state = DeviceStates::from(old.device_states); + + let device_states = DevicesState { + mmio_state, + acpi_state, + pci_state: PciDevicesState::default(), + }; + + // Convert VM state (embeds the reconstructed resource allocator) + let vm_state = VmState::from(old.vm_state, resource_allocator); + + // x86_64: VcpuState is the same type in v1.12 and v1.14. + // aarch64: VcpuState gains pvtime_ipa field, needs conversion. 
+ #[cfg(target_arch = "x86_64")] + let vcpu_states = old.vcpu_states; + #[cfg(target_arch = "aarch64")] + let vcpu_states: Vec = + old.vcpu_states.into_iter().map(VcpuState::from).collect(); + + Ok(MicrovmState { + vm_info: old.vm_info, + kvm_state: old.kvm_state, + vm_state, + vcpu_states, + device_states, + }) + } +} diff --git a/src/vmm/src/persist/v1_14/x86_64.rs b/src/vmm/src/persist/v1_14/x86_64.rs new file mode 100644 index 00000000000..d772c78016e --- /dev/null +++ b/src/vmm/src/persist/v1_14/x86_64.rs @@ -0,0 +1,93 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::kvm_xsave; +use vm_allocator::AllocPolicy; + +use super::v1_12; +use crate::devices::acpi::generated::vmclock_abi::{ + VMCLOCK_COUNTER_INVALID, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, +}; +use crate::{ + arch::VmState, + devices::acpi::vmclock::{VMCLOCK_SIZE, VmClockState}, + persist::v1_14::ConvertError, +}; + +use super::{ACPIDeviceManagerState, GuestMemoryState, ResourceAllocator}; + +pub use kvm_bindings::Xsave; + +// ─────────────────────────────────────────────────────────────────── +// ACPI device state impl (x86_64: allocates vmclock) +// ─────────────────────────────────────────────────────────────────── + +impl ACPIDeviceManagerState { + pub(crate) fn from( + s: v1_12::ACPIDeviceManagerState, + resource_allocator: &mut ResourceAllocator, + ) -> Result { + let vmgenid = s.vmgenid.ok_or(ConvertError::MissingVmGenId)?; + + // Allocate VmClock from system memory using LastMatch (same as VmClock::new()) + // VmClock must be allocated after VMGenID in the system memory allocator reconstruction. + let vmclock_addr = resource_allocator + .system_memory + .allocate( + VMCLOCK_SIZE as u64, + VMCLOCK_SIZE as u64, + AllocPolicy::LastMatch, + ) + .map_err(ConvertError::Allocator)? 
+ .start(); + + let vmclock = VmClockState { + guest_address: vmclock_addr, + inner: vmclock_abi { + magic: VMCLOCK_MAGIC, + size: VMCLOCK_SIZE, + version: 1, + clock_status: VMCLOCK_STATUS_UNKNOWN, + counter_id: VMCLOCK_COUNTER_INVALID, + ..Default::default() + }, + }; + + Ok(ACPIDeviceManagerState { vmgenid, vmclock }) + } +} + +// ─────────────────────────────────────────────────────────────────── +// VM state (x86_64, v1.14: adds resource_allocator) +// ─────────────────────────────────────────────────────────────────── +impl VmState { + pub(crate) fn from(s: v1_12::VmState, resource_allocator: ResourceAllocator) -> VmState { + VmState { + memory: GuestMemoryState::from(s.memory), + resource_allocator, + pitstate: s.pitstate, + clock: s.clock, + pic_master: s.pic_master, + pic_slave: s.pic_slave, + ioapic: s.ioapic, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Helper used by v1_12::VcpuState::from(v1_10::VcpuState) +// ─────────────────────────────────────────────────────────────────── + +/// Convert a v1.10 `kvm_xsave` into a v1.12/v1.14 `Xsave` (= `FamStructWrapper`). +/// +/// v1.12 introduced `Xsave` to support Intel AMX extended save state (extra FAM entries). +/// A snapshot from v1.10 has no AMX state, so `len = 0` (zero FAM entries). +pub(crate) fn xsave_from_v1_10(old: kvm_xsave) -> Xsave { + let mut xsave = Xsave::new(0).expect("failed to allocate Xsave wrapper"); + // SAFETY: We only overwrite the `xsave` sub-field, not `len`, so the + // FamStructWrapper length invariant is preserved. 
+ unsafe { + xsave.as_mut_fam_struct().xsave = old; + } + xsave +} diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index fdd0862a9d4..78f3a254518 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -28,8 +28,9 @@ use crate::vmm_config::balloon::{ use crate::vmm_config::boot_source::{BootSourceConfig, BootSourceConfigError}; use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, DriveError}; use crate::vmm_config::entropy::{EntropyDeviceConfig, EntropyDeviceError}; -use crate::vmm_config::instance_info::InstanceInfo; +use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vmm_config::machine_config::{MachineConfig, MachineConfigError, MachineConfigUpdate}; +use crate::vmm_config::meminfo::{MemoryDirty, MemoryMapingsResponse, MemoryResponse}; use crate::vmm_config::memory_hotplug::{ MemoryHotplugConfig, MemoryHotplugConfigError, MemoryHotplugSizeUpdate, }; @@ -146,6 +147,12 @@ pub enum VmmAction { /// Update the microVM configuration (memory & vcpu) using `VmUpdateConfig` as input. This /// action can only be called before the microVM has booted. UpdateMachineConfiguration(MachineConfigUpdate), + /// Get the guest memory mappings to host memory + GetMemoryMappings, + /// Get guest memory resident and empty pages information + GetMemory, + /// Get guest memory dirty pages information + GetMemoryDirty, } /// Wrapper for all errors associated with VMM actions. @@ -197,6 +204,8 @@ pub enum VmmActionError { OperationNotSupportedPostBoot, /// The requested operation is not supported before starting the microVM. OperationNotSupportedPreBoot, + /// The requested operation is not supported while the microVM is running. 
+ OperationNotSupportedWhileRunning, /// Start microvm error: {0} StartMicrovm(#[from] StartMicrovmError), /// Vsock config error: {0} @@ -228,6 +237,12 @@ pub enum VmmData { VirtioMemStatus(VirtioMemStatus), /// The status of the virtio-balloon hinting run HintingStatus(HintingStatus), + /// The guest memory mapping information. + MemoryMappings(MemoryMapingsResponse), + /// The guest memory resident and empty pages information + Memory(MemoryResponse), + /// The guest memory dirty pages information + MemoryDirty(MemoryDirty), } /// Trait used for deduplicating the MMDS request handling across the two ApiControllers. @@ -495,7 +510,10 @@ impl<'a> PrebootApiController<'a> { | UpdateNetworkInterface(_) | StartFreePageHinting(_) | GetFreePageHintingStatus - | StopFreePageHinting => Err(VmmActionError::OperationNotSupportedPreBoot), + | StopFreePageHinting + | GetMemoryMappings + | GetMemory + | GetMemoryDirty => Err(VmmActionError::OperationNotSupportedPreBoot), #[cfg(target_arch = "x86_64")] SendCtrlAltDel => Err(VmmActionError::OperationNotSupportedPreBoot), } @@ -771,6 +789,9 @@ impl RuntimeApiController { .update_memory_hotplug_size(cfg.requested_size_mib) .map(|_| VmmData::Empty) .map_err(VmmActionError::MemoryHotplugUpdate), + GetMemoryMappings => self.get_guest_memory_mappings(), + GetMemory => self.get_guest_memory_info(), + GetMemoryDirty => self.get_dirty_memory_info(), // Operations not allowed post-boot. 
ConfigureBootSource(_) | ConfigureLogger(_) @@ -937,6 +958,57 @@ impl RuntimeApiController { .map_err(NetworkInterfaceError::DeviceUpdate) .map_err(VmmActionError::NetworkConfig) } + + /// Get guest memory mappings + fn get_guest_memory_mappings(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + + let vmm = self.vmm.lock().expect("Poisoned lock"); + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let mappings = vmm.guest_memory_mappings(page_size); + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get memory mappings' VMM action took {elapsed_time_us} us."); + Ok(VmmData::MemoryMappings(MemoryMapingsResponse { mappings })) + } + + /// Get resident and empty pages information for guest memory + fn get_guest_memory_info(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + let vmm = self.vmm.lock().expect("Poisoned lock"); + + // Check if VM is paused + if vmm.instance_info.state != VmState::Paused { + return Err(VmmActionError::OperationNotSupportedWhileRunning); + } + + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let (resident, empty) = vmm.guest_memory_info(page_size)?; + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get memory info' VMM action took {elapsed_time_us} us."); + + Ok(VmmData::Memory(MemoryResponse { resident, empty })) + } + + /// Get dirty pages information for guest memory + fn get_dirty_memory_info(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + let vmm = self.vmm.lock().expect("Poisoned lock"); + + // Check if VM is paused + if vmm.instance_info.state != VmState::Paused { + return Err(VmmActionError::OperationNotSupportedWhileRunning); + } + + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let bitmap = vmm.get_dirty_memory(page_size)?; + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get dirty 
memory' VMM action took {elapsed_time_us} us."); + + Ok(VmmData::MemoryDirty(MemoryDirty { bitmap })) + } } #[cfg(test)] @@ -1243,7 +1315,7 @@ mod tests { CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), }, ))); #[cfg(target_arch = "x86_64")] @@ -1369,6 +1441,7 @@ mod tests { track_dirty_pages: false, resume_vm: false, network_overrides: vec![], + clock_realtime: false, }, ))); check_unsupported(runtime_request(VmmAction::SetEntropyDevice( diff --git a/src/vmm/src/snapshot/mod.rs b/src/vmm/src/snapshot/mod.rs index 76b5203298d..360b823712b 100644 --- a/src/vmm/src/snapshot/mod.rs +++ b/src/vmm/src/snapshot/mod.rs @@ -81,26 +81,21 @@ fn serialize(data: &S, write: &mut W) -> Result<(), Snap /// Firecracker snapshot header #[derive(Debug, Serialize, Deserialize)] -struct SnapshotHdr { +pub struct SnapshotHdr { /// magic value - magic: u64, + pub magic: u64, /// Snapshot data version - version: Version, + pub version: Version, } impl SnapshotHdr { - fn load(buf: &mut &[u8]) -> Result { + pub(crate) fn load(buf: &mut &[u8]) -> Result { let (hdr, bytes_read) = bincode::serde::decode_from_slice::(buf, BINCODE_CONFIG)?; if hdr.magic != SNAPSHOT_MAGIC_ID { return Err(SnapshotError::InvalidMagic(hdr.magic)); } - if hdr.version.major != SNAPSHOT_VERSION.major || hdr.version.minor > SNAPSHOT_VERSION.minor - { - return Err(SnapshotError::InvalidFormatVersion(hdr.version)); - } - *buf = &buf[bytes_read..]; Ok(hdr) diff --git a/src/vmm/src/utils/mod.rs b/src/vmm/src/utils/mod.rs index 1288abef0ba..4179be93fec 100644 --- a/src/vmm/src/utils/mod.rs +++ b/src/vmm/src/utils/mod.rs @@ -9,12 +9,13 @@ pub mod net; pub mod signal; /// Module with state machine pub mod sm; +/// Module with pagemap utilities +pub mod pagemap; use std::fs::{File, OpenOptions}; use std::num::Wrapping; use std::os::unix::fs::OpenOptionsExt; use std::path::Path; -use std::result::Result; use 
libc::O_NONBLOCK; @@ -76,14 +77,16 @@ pub const fn align_down(addr: u64, align: u64) -> u64 { addr & !(align - 1) } -/// Create and open a File for writing to it. -/// In case we open a FIFO, in order to not block the instance if nobody is consuming the message -/// that is flushed to it, we are opening it with `O_NONBLOCK` flag. -/// In this case, writing to a pipe will start failing when reaching 64K of unconsumed content. -pub fn open_file_write_nonblock(path: &Path) -> Result { +/// Create and open a file for both reading and writing to it with a O_NONBLOCK flag. +/// In case we open a FIFO, we need all READ, WRITE and O_NONBLOCK in order to not block the process +/// if nobody is consuming the message. Otherwise opening the FIFO with only WRITE and O_NONBLOCK +/// will fail with ENXIO if there is no reader already attached to it. +/// NOTE: writing to a pipe will start failing when reaching 64K of unconsumed content. +pub fn open_file_nonblock(path: &Path) -> Result { OpenOptions::new() .custom_flags(O_NONBLOCK) .create(true) + .read(true) .write(true) .open(path) } diff --git a/src/vmm/src/utils/pagemap.rs b/src/vmm/src/utils/pagemap.rs new file mode 100644 index 00000000000..fff9e1f5cb2 --- /dev/null +++ b/src/vmm/src/utils/pagemap.rs @@ -0,0 +1,115 @@ +//! Utilities for reading /proc/self/pagemap to track dirty pages. 
+ +#![allow(clippy::cast_possible_wrap)] + +use std::fs::File; +use std::os::unix::io::AsRawFd; + +use crate::arch::host_page_size; + +const PAGEMAP_ENTRY_SIZE: usize = 8; + +/// Errors related to pagemap operations +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PagemapError { + /// Failed to open /proc/self/pagemap: {0} + OpenPagemap(#[source] std::io::Error), + /// Failed to read pagemap entry: {0} + ReadEntry(#[source] std::io::Error), + /// Failed to open /proc/self/clear_refs: {0} + OpenClearRefs(#[source] std::io::Error), + /// Failed to clear soft-dirty bits: {0} + ClearSoftDirty(#[source] std::io::Error), +} + +/// Represents a single entry in /proc/pid/pagemap. +/// +/// Each virtual page has an 8-byte entry with the following layout: +/// - Bits 0-54: Page frame number (PFN) if present +/// - Bit 55: Page is soft-dirty (written to since last clear) +/// - Bit 56: Page is exclusively mapped +/// - Bit 57: Page is write-protected via userfaultfd +/// - Bit 58: Unused +/// - Bit 59-60: Unused +/// - Bit 61: Page is file-page or shared-anon +/// - Bit 62: Page is swapped +/// - Bit 63: Page is present in RAM +#[derive(Debug, Clone, Copy)] +pub struct PagemapEntry { + raw: u64, +} + +impl PagemapEntry { + /// Create a PagemapEntry from the 8 raw bytes of an entry (native-endian) + pub fn from_bytes(bytes: [u8; 8]) -> Self { + Self { + raw: u64::from_ne_bytes(bytes), + } + } + + /// Check if page is write-protected via userfaultfd (bit 57) + pub fn is_write_protected(&self) -> bool { + (self.raw & (1u64 << 57)) != 0 + } + + /// Check if page is present in RAM (bit 63) + pub fn is_present(&self) -> bool { + (self.raw & (1u64 << 63)) != 0 + } +} + +/// Reader for /proc/self/pagemap +#[derive(Debug)] +pub struct PagemapReader { + pagemap_fd: File, +} + +impl PagemapReader { + /// Create a new PagemapReader + pub fn new(_page_size: usize) -> Result { + let pagemap_fd = File::open("/proc/self/pagemap").map_err(PagemapError::OpenPagemap)?; + + Ok(Self { pagemap_fd }) + } + + 
/// Check if a single page is dirty (write-protected bit cleared). + /// + /// Checks the first host page (4K) of the guest page at the given address. + /// For huge pages, all host pages within the huge page typically have the same + /// dirty status, so sampling the first is sufficient. + /// + /// # Arguments + /// * `virt_addr` - Virtual address of the page to check + /// + /// # Returns + /// True if the page is present and write-protected bit is cleared (dirty). + pub fn is_page_dirty(&self, virt_addr: usize) -> Result { + // Pagemap always uses host (4K) page size + let host_page_size = host_page_size(); + + // Calculate offset for this virtual page (using host page size) + let host_vpn = virt_addr / host_page_size; + let offset = (host_vpn * PAGEMAP_ENTRY_SIZE) as i64; + + let mut entry_bytes = [0u8; 8]; + + // SAFETY: pread is safe as long as the fd is valid and the buffer is properly sized + let ret = unsafe { + libc::pread( + self.pagemap_fd.as_raw_fd(), + entry_bytes.as_mut_ptr().cast(), + PAGEMAP_ENTRY_SIZE, + offset, + ) + }; + + if ret != PAGEMAP_ENTRY_SIZE as isize { + return Err(PagemapError::ReadEntry(std::io::Error::last_os_error())); + } + + let entry = PagemapEntry::from_bytes(entry_bytes); + + // Page must be present and the write_protected bit cleared (indicating it was written to) + Ok(entry.is_present() && !entry.is_write_protected()) + } +} diff --git a/src/vmm/src/vmm_config/meminfo.rs b/src/vmm/src/vmm_config/meminfo.rs new file mode 100644 index 00000000000..693ece6b4d4 --- /dev/null +++ b/src/vmm/src/vmm_config/meminfo.rs @@ -0,0 +1,29 @@ +use serde::Serialize; + +use crate::persist::GuestRegionUffdMapping; + +/// Serializable struct that contains information about guest's memory mappings +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryMapingsResponse { + /// Vector with mappings from guest physical to host virtual memory + pub mappings: Vec, +} + +/// Information about guest memory resident pages and 
pages that are all-0s +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryResponse { + /// Bitmap for resident pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page is present in the resident memory set + pub resident: Vec, + /// Bitmap for empty pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page is empty (all 0s). + pub empty: Vec, +} + +/// Information about dirty guest memory pages +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryDirty { + /// Bitmap for dirty pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page has been written since the last snapshot. + pub bitmap: Vec, +} diff --git a/src/vmm/src/vmm_config/metrics.rs b/src/vmm/src/vmm_config/metrics.rs index 9d44c35f6a3..38001661fc6 100644 --- a/src/vmm/src/vmm_config/metrics.rs +++ b/src/vmm/src/vmm_config/metrics.rs @@ -7,7 +7,7 @@ use std::path::PathBuf; use serde::{Deserialize, Serialize}; use crate::logger::{FcLineWriter, METRICS}; -use crate::utils::open_file_write_nonblock; +use crate::utils::open_file_nonblock; /// Strongly typed structure used to describe the metrics system. #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] @@ -26,7 +26,7 @@ pub enum MetricsConfigError { /// Configures the metrics as described in `metrics_cfg`. 
pub fn init_metrics(metrics_cfg: MetricsConfig) -> Result<(), MetricsConfigError> { let writer = FcLineWriter::new( - open_file_write_nonblock(&metrics_cfg.metrics_path) + open_file_nonblock(&metrics_cfg.metrics_path) .map_err(|err| MetricsConfigError::InitializationFailure(err.to_string()))?, ); METRICS diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index 9a4c104ce3a..c593b3ec0dc 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -20,6 +20,8 @@ pub mod entropy; pub mod instance_info; /// Wrapper for configuring the memory and CPU of the microVM. pub mod machine_config; +/// Wrapper for getting memory-related information. +pub mod meminfo; /// Wrapper for configuring memory hotplug. pub mod memory_hotplug; /// Wrapper for configuring the metrics. diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 13a87ba30c4..393ae945498 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -44,7 +44,7 @@ pub struct CreateSnapshotParams { /// Path to the file that will contain the microVM state. pub snapshot_path: PathBuf, /// Path to the file that will contain the guest memory. - pub mem_file_path: PathBuf, + pub mem_file_path: Option, } /// Allows for changing the mapping between tap devices and host devices @@ -72,6 +72,10 @@ pub struct LoadSnapshotParams { pub resume_vm: bool, /// The network devices to override on load. pub network_overrides: Vec, + /// [x86_64 only] When set to true, passes `KVM_CLOCK_REALTIME` to `KVM_SET_CLOCK` on restore, + /// advancing kvmclock by the wall-clock time elapsed since the snapshot was taken. When false + /// (default), kvmclock resumes from where it was at snapshot time. + pub clock_realtime: bool, } /// Stores the configuration for loading a snapshot that is provided by the user. @@ -101,6 +105,9 @@ pub struct LoadSnapshotConfig { /// The network devices to override on load. 
#[serde(default)] pub network_overrides: Vec, + /// [x86_64 only] When set to true, passes `KVM_CLOCK_REALTIME` to `KVM_SET_CLOCK` on restore. + #[serde(default)] + pub clock_realtime: bool, } /// Stores the configuration used for managing snapshot memory. diff --git a/src/vmm/src/vstate/interrupts.rs b/src/vmm/src/vstate/interrupts.rs index 5246144d8f6..852086b3e6d 100644 --- a/src/vmm/src/vstate/interrupts.rs +++ b/src/vmm/src/vstate/interrupts.rs @@ -187,7 +187,7 @@ impl<'a> Persist<'a> for MsixVectorGroup { fn restore( constructor_args: Self::ConstructorArgs, state: &Self::State, - ) -> std::result::Result { + ) -> Result { let mut vectors = Vec::with_capacity(state.len()); for gsi in state { diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index ef160d9e918..846ed9d2688 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -59,6 +59,24 @@ pub enum MemoryError { Unaligned, /// Error protecting memory slot: {0} Mprotect(std::io::Error), + /// Size too large for i64 conversion + SlotSizeTooLarge, + /// Dirty bitmap not found for memory slot {0} + DirtyBitmapNotFound(u32), + /// Dirty bitmap is larger than the slot size + DirtyBitmapTooLarge, + /// Dirty bitmap is smaller than the slot size + DirtyBitmapTooSmall, + /// Seek error: {0} + SeekError(std::io::Error), + /// Volatile memory error: {0} + VolatileMemoryError(vm_memory::VolatileMemoryError), +} + +impl From for MemoryError { + fn from(e: vm_memory::VolatileMemoryError) -> Self { + MemoryError::VolatileMemoryError(e) + } } /// Type of the guest region @@ -121,25 +139,47 @@ impl<'a> GuestMemorySlot<'a> { writer: &mut T, kvm_bitmap: &[u64], page_size: usize, - ) -> Result<(), GuestMemoryError> { + ) -> Result<(), MemoryError> { let firecracker_bitmap = self.slice.bitmap(); let mut write_size = 0; let mut skip_size = 0; let mut dirty_batch_start = 0; + let expected_bitmap_array_len = (self.slice.len() / page_size).div_ceil(64); + if kvm_bitmap.len() > 
expected_bitmap_array_len { + return Err(MemoryError::DirtyBitmapTooLarge); + } else if kvm_bitmap.len() < expected_bitmap_array_len { + return Err(MemoryError::DirtyBitmapTooSmall); + } + for (i, v) in kvm_bitmap.iter().enumerate() { for j in 0..64 { let is_kvm_page_dirty = ((v >> j) & 1u64) != 0u64; let page_offset = ((i * 64) + j) * page_size; let is_firecracker_page_dirty = firecracker_bitmap.dirty_at(page_offset); + // We process 64 pages at a time, however the number of pages + // in the slot might not be a multiple of 64. We need to break + // once we go past the last page that is actually part of the + // region. + if page_offset >= self.slice.len() { + // Ensure there are no more dirty bits after this point + if (v >> j) != 0 { + return Err(MemoryError::DirtyBitmapTooLarge); + } + break; + } + if is_kvm_page_dirty || is_firecracker_page_dirty { // We are at the start of a new batch of dirty pages. if skip_size > 0 { // Seek forward over the unmodified pages. + let offset = skip_size + .try_into() + .map_err(|_| MemoryError::SlotSizeTooLarge)?; writer - .seek(SeekFrom::Current(skip_size.try_into().unwrap())) - .unwrap(); + .seek(SeekFrom::Current(offset)) + .map_err(MemoryError::SeekError)?; dirty_batch_start = page_offset; skip_size = 0; } @@ -161,6 +201,14 @@ impl<'a> GuestMemorySlot<'a> { writer.write_all_volatile(&self.slice.subslice(dirty_batch_start, write_size)?)?; } + // Advance the cursor even if the trailing pages are clean, so that the + // next slot starts writing at the correct offset. 
+ if skip_size > 0 { + writer + .seek(SeekFrom::Current(skip_size.try_into().unwrap())) + .map_err(MemoryError::SeekError)?; + } + Ok(()) } @@ -187,14 +235,6 @@ impl<'a> GuestMemorySlot<'a> { } } -fn addr_in_range(addr: GuestAddress, start: GuestAddress, len: usize) -> bool { - if let Some(end) = start.checked_add(len as u64) { - addr >= start && addr < end - } else { - false - } -} - impl GuestRegionMmapExt { /// Adds a DRAM region which only contains a single plugged slot pub(crate) fn dram_from_mmap_region(region: GuestRegionMmap, slot: u32) -> Self { @@ -297,11 +337,17 @@ impl GuestRegionMmapExt { len: usize, ) -> impl Iterator> { self.slots().map(|(slot, _)| slot).filter(move |slot| { - if let Some(slot_end) = slot.guest_addr.checked_add(slot.slice.len() as u64) { - addr_in_range(slot.guest_addr, from, len) || addr_in_range(slot_end, from, len) - } else { - false - } + // Two intervals [a, b) and [c, d) intersect iff a < d && c < b. + // This correctly handles the containment case where the slot fully + // contains the range (or vice versa). 
+ let slot_start = slot.guest_addr; + let Some(slot_end) = slot_start.checked_add(slot.slice.len() as u64) else { + return false; + }; + let Some(range_end) = from.checked_add(len as u64) else { + return false; + }; + slot_start < range_end && from < slot_end }) } @@ -668,10 +714,15 @@ impl GuestMemoryExtension for GuestMemoryMmap { .flat_map(|region| region.slots()) .try_for_each(|(mem_slot, plugged)| { if !plugged { - let ilen = i64::try_from(mem_slot.slice.len()).unwrap(); - writer.seek(SeekFrom::Current(ilen)).unwrap(); + let ilen = i64::try_from(mem_slot.slice.len()) + .map_err(|_| MemoryError::SlotSizeTooLarge)?; + writer + .seek(SeekFrom::Current(ilen)) + .map_err(MemoryError::SeekError)?; } else { - let kvm_bitmap = dirty_bitmap.get(&mem_slot.slot).unwrap(); + let kvm_bitmap = dirty_bitmap + .get(&mem_slot.slot) + .ok_or(MemoryError::DirtyBitmapNotFound(mem_slot.slot))?; mem_slot.dump_dirty(writer, kvm_bitmap, page_size)?; } Ok(()) @@ -683,7 +734,7 @@ impl GuestMemoryExtension for GuestMemoryMmap { self.reset_dirty(); } - write_result.map_err(MemoryError::WriteMemory) + write_result } /// Resets all the memory region bitmaps @@ -814,6 +865,7 @@ mod tests { use std::collections::HashMap; use std::io::{Read, Seek, Write}; + use std::os::unix::fs::MetadataExt; use vmm_sys_util::tempfile::TempFile; @@ -1123,17 +1175,23 @@ mod tests { .write(&second_region, region_2_address) .unwrap(); + // Firecracker Dirty Bitmap after the writes: + // First region pages: [dirty, dirty] + // Second region pages: [dirty, dirty] + let memory_state = guest_memory.describe(); - // Dump only the dirty pages. 
+ // KVM dirty bitmap: // First region pages: [dirty, clean] // Second region pages: [clean, dirty] - let mut dirty_bitmap: DirtyBitmap = HashMap::new(); - dirty_bitmap.insert(0, vec![0b01]); - dirty_bitmap.insert(1, vec![0b10]); + let mut kvm_dirty_bitmap: DirtyBitmap = HashMap::new(); + kvm_dirty_bitmap.insert(0, vec![0b01]); + kvm_dirty_bitmap.insert(1, vec![0b10]); let mut file = TempFile::new().unwrap().into_file(); - guest_memory.dump_dirty(&mut file, &dirty_bitmap).unwrap(); + guest_memory + .dump_dirty(&mut file, &kvm_dirty_bitmap) + .unwrap(); // We can restore from this because this is the first dirty dump. let restored_guest_memory = @@ -1158,18 +1216,25 @@ mod tests { let ones = vec![1u8; page_size]; let twos = vec![2u8; page_size]; - // Firecracker Bitmap - // First region pages: [dirty, clean] + // Firecracker Dirty Bitmap: + // First region pages: [clean, dirty] // Second region pages: [clean, clean] guest_memory .write(&twos, GuestAddress(page_size as u64)) .unwrap(); + // KVM dirty bitmap: + // First region pages: [dirty, clean] + // Second region pages: [clean, dirty] + kvm_dirty_bitmap.insert(0, vec![0b01]); + kvm_dirty_bitmap.insert(1, vec![0b10]); - guest_memory.dump_dirty(&mut reader, &dirty_bitmap).unwrap(); + guest_memory + .dump_dirty(&mut reader, &kvm_dirty_bitmap) + .unwrap(); // Check that only the dirty regions are dumped. 
let mut diff_file_content = Vec::new(); - let expected_first_region = [ + let expected_file_contents = [ ones.as_slice(), twos.as_slice(), zeros.as_slice(), @@ -1178,7 +1243,71 @@ mod tests { .concat(); reader.seek(SeekFrom::Start(0)).unwrap(); reader.read_to_end(&mut diff_file_content).unwrap(); - assert_eq!(expected_first_region, diff_file_content); + assert_eq!(expected_file_contents, diff_file_content); + + // Take a 3rd snapshot + + // Firecracker Dirty Bitmap: + // First region pages: [dirty, clean] + // Second region pages: [dirty, clean] + guest_memory.write(&twos, region_1_address).unwrap(); + guest_memory.write(&ones, region_2_address).unwrap(); + // KVM dirty bitmap: + // First region pages: [clean, clean] + // Second region pages: [clean, clean] + kvm_dirty_bitmap.insert(0, vec![0b00]); + kvm_dirty_bitmap.insert(1, vec![0b00]); + + let file = TempFile::new().unwrap(); + let logical_size = page_size as u64 * 4; + file.as_file().set_len(logical_size).unwrap(); + + let mut reader = file.into_file(); + guest_memory + .dump_dirty(&mut reader, &kvm_dirty_bitmap) + .unwrap(); + + // Check that only the dirty regions are dumped. + let mut diff_file_content = Vec::new(); + // The resulting file is a sparse file with holes. + let expected_file_contents = [ + twos.as_slice(), + zeros.as_slice(), // hole + ones.as_slice(), + zeros.as_slice(), // hole + ] + .concat(); + reader.seek(SeekFrom::Start(0)).unwrap(); + reader.read_to_end(&mut diff_file_content).unwrap(); + + assert_eq!(expected_file_contents, diff_file_content); + + // Make sure that only 2 of the pages are written in the file and the + // other two are holes. 
+ let metadata = reader.metadata().unwrap(); + let physical_size = metadata.blocks() * 512; + assert_eq!(physical_size, 2 * page_size as u64); + assert_ne!(physical_size, logical_size); + + // Test with bitmaps that are too large or too small + kvm_dirty_bitmap.insert(0, vec![0b1, 0b01]); + kvm_dirty_bitmap.insert(1, vec![0b10]); + assert!(matches!( + guest_memory.dump_dirty(&mut reader, &kvm_dirty_bitmap), + Err(MemoryError::DirtyBitmapTooLarge) + )); + kvm_dirty_bitmap.insert(0, vec![0b01]); + kvm_dirty_bitmap.insert(1, vec![0b110]); + assert!(matches!( + guest_memory.dump_dirty(&mut reader, &kvm_dirty_bitmap), + Err(MemoryError::DirtyBitmapTooLarge) + )); + kvm_dirty_bitmap.insert(0, vec![]); + kvm_dirty_bitmap.insert(1, vec![0b10]); + assert!(matches!( + guest_memory.dump_dirty(&mut reader, &kvm_dirty_bitmap), + Err(MemoryError::DirtyBitmapTooSmall) + )); } #[test] @@ -1330,4 +1459,57 @@ mod tests { GuestMemoryError::IOError(_) ); } + + /// Verifies that `slots_intersecting_range` returns the correct slots for + /// ranges at slot boundaries, interior to a slot, and spanning two slots. 
+ #[test] + fn test_slots_intersecting_range() { + let page_size = get_page_size().unwrap(); + let slot_size = 4 * page_size; + let region_size = 2 * slot_size; + let base = GuestAddress(0); + let slot1_base = base.unchecked_add(slot_size as u64); + + let mmap_region = anonymous( + std::iter::once((base, region_size)), + false, + HugePageConfig::None, + ) + .unwrap() + .into_iter() + .next() + .unwrap(); + + let region = GuestRegionMmapExt::hotpluggable_from_mmap_region(mmap_region, 0, slot_size); + assert_eq!(region.slot_cnt(), 2); + + // (range_offset_in_pages, range_len_in_pages, expected_slot_addrs) + let cases: &[(usize, usize, &[GuestAddress])] = &[ + // At slot 0 boundary + (0, 1, &[base]), + // Interior to slot 0 + (1, 1, &[base]), + // Interior to slot 1 + (5, 1, &[slot1_base]), + // Spanning slot 0 and slot 1 + (3, 2, &[base, slot1_base]), + // Entire region + (0, 8, &[base, slot1_base]), + // Outside the region + (8, 1, &[]), + // Zero-length range + (0, 0, &[]), + ]; + + for &(offset_pages, len_pages, expected) in cases { + let from = base.unchecked_add((offset_pages * page_size) as u64); + let len = len_pages * page_size; + let found: Vec<_> = region.slots_intersecting_range(from, len).collect(); + let addrs: Vec<_> = found.iter().map(|s| s.guest_addr).collect(); + assert_eq!( + addrs, expected, + "offset={offset_pages} pages, len={len_pages} pages" + ); + } + } } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 83e899eff1d..0ecf2ef94b7 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -305,7 +305,7 @@ impl Vm { } /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. 
- pub fn get_dirty_bitmap(&self) -> Result { + pub fn get_dirty_bitmap(&self, page_size: usize) -> Result { self.guest_memory() .iter() .flat_map(|region| region.plugged_slots()) @@ -318,6 +318,7 @@ impl Vm { None => mincore_bitmap( mem_slot.slice.ptr_guard_mut().as_ptr(), mem_slot.slice.len(), + page_size, )?, }; Ok((mem_slot.slot, bitmap)) @@ -335,6 +336,7 @@ impl Vm { &self, mem_file_path: &Path, snapshot_type: SnapshotType, + page_size: usize, ) -> Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; @@ -377,7 +379,7 @@ impl Vm { match snapshot_type { SnapshotType::Diff => { - let dirty_bitmap = self.get_dirty_bitmap()?; + let dirty_bitmap = self.get_dirty_bitmap(page_size)?; self.guest_memory().dump_dirty(&mut file, &dirty_bitmap)?; } SnapshotType::Full => { @@ -503,7 +505,11 @@ impl Vm { /// Use `mincore(2)` to overapproximate the dirty bitmap for the given memslot. To be used /// if a diff snapshot is requested, but dirty page tracking wasn't enabled. -fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { +pub(crate) fn mincore_bitmap( + addr: *mut u8, + len: usize, + page_size: usize, +) -> Result, VmError> { // TODO: Once Host 5.10 goes out of support, we can make this more robust and work on // swap-enabled systems, by doing mlock2(MLOCK_ONFAULT)/munlock() in this function (to // force swapped-out pages to get paged in, so that mincore will consider them incore). @@ -513,8 +519,11 @@ fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { // Mincore always works at PAGE_SIZE granularity, even if the VMA we are dealing with // is a hugetlbfs VMA (e.g. to report a single hugepage as "present", mincore will // give us 512 4k markers with the lowest bit set). 
- let page_size = host_page_size(); - let mut mincore_bitmap = vec![0u8; len / page_size]; + let host_page_size = host_page_size(); + let mut mincore_bitmap = vec![0u8; len / host_page_size]; + // The bitmap we return though tracks pages in terms of the actually used page size. In + // the case of a hugetlbfs VMA, we just need to check if the first of the reported pages + // is present. let mut bitmap = vec![0u64; (len / page_size).div_ceil(64)]; // SAFETY: The safety invariants of GuestRegionMmap ensure that region.as_ptr() is a valid @@ -529,7 +538,8 @@ fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { return Err(VmError::Mincore(vmm_sys_util::errno::Error::last())); } - for (page_idx, b) in mincore_bitmap.iter().enumerate() { + let step = page_size / host_page_size; + for (page_idx, b) in mincore_bitmap.iter().step_by(step).enumerate() { bitmap[page_idx / 64] |= (*b as u64 & 0x1) << (page_idx as u64 % 64); } @@ -885,7 +895,7 @@ pub(crate) mod tests { let restored_state: VmState = Snapshot::load_without_crc_check(snapshot_data.as_slice()) .unwrap() .data; - vm.restore_state(&restored_state).unwrap(); + vm.restore_state(&restored_state, false).unwrap(); let mut resource_allocator = vm.resource_allocator(); let gsi_new = resource_allocator.allocate_gsi_msi(1).unwrap()[0]; diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 6a5e6a08a14..4d09fe62b0f 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -235,7 +235,7 @@ fn verify_create_snapshot( let snapshot_params = CreateSnapshotParams { snapshot_type, snapshot_path: snapshot_file.as_path().to_path_buf(), - mem_file_path: memory_file.as_path().to_path_buf(), + mem_file_path: Some(memory_file.as_path().to_path_buf()), }; controller @@ -302,6 +302,7 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { track_dirty_pages: false, resume_vm: true, network_overrides: vec![], + clock_realtime: false, })) 
.unwrap(); @@ -386,6 +387,7 @@ fn verify_load_snap_disallowed_after_boot_resources(res: VmmAction, res_name: &s track_dirty_pages: false, resume_vm: false, network_overrides: vec![], + clock_realtime: false, }); let err = preboot_api_controller.handle_preboot_request(req); assert!( diff --git a/tests/conftest.py b/tests/conftest.py index 7c777eb6abd..575e76be272 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -259,6 +259,18 @@ def change_net_config_space_bin(test_fc_session_root_path): yield change_net_config_space_bin +@pytest.fixture(scope="session") +def devmem_bin(test_fc_session_root_path): + """Build a minimal /dev/mem read/write tool.""" + bin_path = os.path.join(test_fc_session_root_path, "devmem") + build_tools.gcc_compile( + "host_tools/devmem.c", + bin_path, + extra_flags="-static", + ) + yield bin_path + + @pytest.fixture(scope="session") def waitpkg_bin(test_fc_session_root_path): """Build a binary that attempts to use WAITPKG (UMONITOR / UMWAIT)""" diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index c8babc9f54b..7ba32305187 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -1076,6 +1076,7 @@ def restore_from_snapshot( snapshot: Snapshot, resume: bool = False, rename_interfaces: dict = None, + clock_realtime: bool = False, *, uffd_handler_name: str = None, ): @@ -1132,6 +1133,9 @@ def restore_from_snapshot( # can be inline in the snapshot_load command below optional_kwargs["network_overrides"] = iface_overrides + if clock_realtime: + optional_kwargs["clock_realtime"] = clock_realtime + self.api.snapshot_load.put( mem_backend=mem_backend, snapshot_path=str(jailed_vmstate), @@ -1286,12 +1290,17 @@ def build(self, kernel=None, rootfs=None, **kwargs): vm.ssh_key = ssh_key return vm - def build_from_snapshot(self, snapshot: Snapshot, uffd_handler_name=None): + def build_from_snapshot( + self, snapshot: Snapshot, uffd_handler_name=None, clock_realtime=False + ): """Build a microvm from a snapshot""" 
vm = self.build() vm.spawn() vm.restore_from_snapshot( - snapshot, resume=True, uffd_handler_name=uffd_handler_name + snapshot, + resume=True, + uffd_handler_name=uffd_handler_name, + clock_realtime=clock_realtime, ) return vm diff --git a/tests/host_tools/devmem.c b/tests/host_tools/devmem.c new file mode 100644 index 00000000000..49d1fc17438 --- /dev/null +++ b/tests/host_tools/devmem.c @@ -0,0 +1,93 @@ +// Copyright 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Minimal /dev/mem read/write tool for integration tests. +// +// Usage: +// devmem read +// devmem write +// +// : physical address (hex or decimal) +// : access width in bytes (1, 2, or 4) +// : value to write (hex or decimal, write only) +// +// On read, prints the value as a hex number to stdout. +// On write, writes the value then reads back and prints it. +// Exit code 0 on success, non-zero on failure. + +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc < 4) { + fprintf(stderr, + "Usage: %s read \n" + " %s write \n", + argv[0], argv[0]); + return 1; + } + + int is_write = strcmp(argv[1], "write") == 0; + if (is_write && argc < 5) { + fprintf(stderr, "write mode requires a value argument\n"); + return 1; + } + + uint64_t addr = strtoull(argv[2], NULL, 0); + int width = atoi(argv[3]); + uint64_t value = is_write ? 
strtoull(argv[4], NULL, 0) : 0; + + if (width != 1 && width != 2 && width != 4) { + fprintf(stderr, "width must be 1, 2, or 4\n"); + return 1; + } + + int fd = open("/dev/mem", O_RDWR | O_SYNC); + if (fd < 0) { + perror("open /dev/mem"); + return 1; + } + + uint64_t page_size = getpagesize(); + uint64_t page_addr = addr & ~(page_size - 1); + uint64_t offset_in_page = addr & (page_size - 1); + uint64_t map_size = page_size; + if (offset_in_page + width > page_size) + map_size *= 2; + + void *map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, page_addr); + if (map == MAP_FAILED) { + perror("mmap"); + close(fd); + return 1; + } + + volatile void *ptr = (volatile char *)map + offset_in_page; + + if (is_write) { + switch (width) { + case 1: *(volatile uint8_t *)ptr = (uint8_t)value; break; + case 2: *(volatile uint16_t *)ptr = (uint16_t)value; break; + case 4: *(volatile uint32_t *)ptr = (uint32_t)value; break; + } + } + + uint32_t result = 0; + switch (width) { + case 1: result = *(volatile uint8_t *)ptr; break; + case 2: result = *(volatile uint16_t *)ptr; break; + case 4: result = *(volatile uint32_t *)ptr; break; + } + + printf("0x%x\n", result); + + munmap(map, map_size); + close(fd); + return 0; +} diff --git a/tests/integration_tests/functional/test_pci.py b/tests/integration_tests/functional/test_pci.py index dc0827b1aae..e4e26f4552e 100644 --- a/tests/integration_tests/functional/test_pci.py +++ b/tests/integration_tests/functional/test_pci.py @@ -2,6 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for the PCI devices""" +# Virtio PCI common config register offsets +# https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-1420003 +COMMON_CFG_QUEUE_SELECT = 0x16 # u16 +COMMON_CFG_QUEUE_SIZE = 0x18 # u16 +COMMON_CFG_QUEUE_ENABLE = 0x1C # u16 +COMMON_CFG_QUEUE_DESC_LO = 0x20 # u32 +COMMON_CFG_QUEUE_DESC_HI = 0x24 # u32 +COMMON_CFG_QUEUE_AVAIL_LO = 0x28 # u32 +COMMON_CFG_QUEUE_AVAIL_HI = 0x2C # u32 
+COMMON_CFG_QUEUE_USED_LO = 0x30 # u32 +COMMON_CFG_QUEUE_USED_HI = 0x34 # u32 + def test_pci_root_present(uvm_any_with_pci): """ @@ -26,3 +38,97 @@ def test_pci_disabled(uvm_any_without_pci): assert ( "00:00.0 Host bridge: Intel Corporation Device" not in stdout ), "PCI root not found in guest" + + +def _find_virtio_blk_bar(vm): + """Find the BAR0 physical address of the first virtio-blk PCI device. + + virtio-blk has PCI device ID 0x1042 (0x1040 + type 2). + + Example:: + + # lspci -n + 00:00.0 0600: 8086:0d57 + 00:01.0 0180: 1af4:1042 (rev 01) + + The resource file has one line per BAR. Each line contains three + space-separated hex values: start, end, flags. + + Example (BAR0 line):: + + # cat /sys/bus/pci/devices/0000:00:01.0/resource | head -1 + 0x0000004000000000 0x000000400007ffff 0x0000000000140204 + """ + stdout = vm.ssh.check_output("lspci -n").stdout.strip() + slot = None + for line in stdout.split("\n"): + parts = line.split() + if len(parts) >= 3 and parts[2] == "1af4:1042": + slot = f"0000:{parts[0]}" + break + assert slot is not None, "No virtio-blk PCI device found" + + cmd = f"cat /sys/bus/pci/devices/{slot}/resource | head -1" + stdout = vm.ssh.check_output(cmd).stdout.strip() + addr = int(stdout.split()[0], 16) + assert addr != 0, f"BAR0 address is 0 for {slot}" + return addr + + +def _devmem_read(vm, tool_path, addr, width): + """Read a physical address via /dev/mem.""" + cmd = f"{tool_path} read 0x{addr:x} {width}" + stdout = vm.ssh.check_output(cmd).stdout.strip() + return int(stdout, 16) + + +def _devmem_write(vm, tool_path, addr, width, value): + """Write a physical address via /dev/mem and return the read-back value.""" + cmd = f"{tool_path} write 0x{addr:x} {width} 0x{value:x}" + stdout = vm.ssh.check_output(cmd).stdout.strip() + return int(stdout, 16) + + +def test_queue_config_immutable(uvm_any_with_pci, devmem_bin): + """ + Test that queue configuration fields cannot be modified by the guest + after the device has been activated 
(DRIVER_OK is set). + + All PCI common config queue fields are read-write, so we can verify + immutability by writing a poison value and checking the readback still + equals the original. + + MMIO queue config immutability is covered by the Rust unit test + test_queue_config_immutable_after_activation in transport/mmio.rs. + MMIO queue fields are write-only (reads return 0), so integration-level + readback verification via /dev/mem is not possible. + """ + vm = uvm_any_with_pci + + rmt_path = "/tmp/devmem" + vm.ssh.scp_put(devmem_bin, rmt_path) + vm.ssh.check_output(f"chmod +x {rmt_path}") + + bar_addr = _find_virtio_blk_bar(vm) + + # Select queue 0 + _devmem_write(vm, rmt_path, bar_addr + COMMON_CFG_QUEUE_SELECT, 2, 0) + + # (name, offset, width, poison_value) + queue_fields = [ + ("queue_size", COMMON_CFG_QUEUE_SIZE, 2, 0), + ("queue_enable", COMMON_CFG_QUEUE_ENABLE, 2, 0), + ("queue_desc_lo", COMMON_CFG_QUEUE_DESC_LO, 4, 0xDEADBEEF), + ("queue_desc_hi", COMMON_CFG_QUEUE_DESC_HI, 4, 0xDEADBEEF), + ("queue_avail_lo", COMMON_CFG_QUEUE_AVAIL_LO, 4, 0xDEADBEEF), + ("queue_avail_hi", COMMON_CFG_QUEUE_AVAIL_HI, 4, 0xDEADBEEF), + ("queue_used_lo", COMMON_CFG_QUEUE_USED_LO, 4, 0xDEADBEEF), + ("queue_used_hi", COMMON_CFG_QUEUE_USED_HI, 4, 0xDEADBEEF), + ] + for name, offset, width, poison in queue_fields: + addr = bar_addr + offset + orig = _devmem_read(vm, rmt_path, addr, width) + readback = _devmem_write(vm, rmt_path, addr, width, poison) + assert ( + readback == orig + ), f"{name} should remain {orig:#x} after DRIVER_OK, got {readback:#x}" diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index bd9f1ec0d9b..dc3a1f761dd 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -410,14 +410,15 @@ def test_create_large_diff_snapshot(uvm_plain): # process would have been taken down. 
-def test_diff_snapshot_overlay(uvm_plain_any, microvm_factory): +@pytest.mark.parametrize("mem_size", [256, 4096]) +def test_diff_snapshot_overlay(uvm_plain_any, microvm_factory, mem_size): """ Tests that if we take a diff snapshot and direct firecracker to write it on top of an existing snapshot file, it will successfully merge them. """ basevm = uvm_plain_any basevm.spawn() - basevm.basic_config(track_dirty_pages=True) + basevm.basic_config(track_dirty_pages=True, mem_size_mib=mem_size) basevm.add_net_iface() basevm.start() @@ -583,3 +584,105 @@ def test_snapshot_rename_interface(uvm_nano, microvm_factory): rename_interfaces={iface_override.dev_name: iface_override.tap_name}, resume=True, ) + + +SLEEP_SECONDS = 30 + +CLOCK_SOURCES = {"x86_64": ["tsc", "kvm-clock"], "aarch64": ["arch_sys_counter"]}[ + global_props.cpu_architecture +] + + +def read_guest_monotonic(vm): + """Read CLOCK_MONOTONIC inside the guest""" + _, stdout, _ = vm.ssh.check_output( + "python3 -c 'import time; print(time.monotonic())'" + ) + return float(stdout.strip()) + + +def read_guest_clocksource(vm): + """Read the active clocksource inside the guest""" + _, stdout, _ = vm.ssh.check_output( + "cat /sys/devices/system/clocksource/clocksource0/current_clocksource" + ) + return stdout.strip() + + +@pytest.mark.parametrize("clocksource", CLOCK_SOURCES) +@pytest.mark.parametrize("clock_realtime", [False, True]) +def test_clocksource_snapshot_restore( + uvm_plain_any, microvm_factory, clocksource, clock_realtime +): + """Measure CLOCK_MONOTONIC before snapshot and after restore to determine + whether the clocksource jumps forward or resumes from where it left off.""" + + if clock_realtime and clocksource != "kvm-clock": + pytest.skip(f"Clocksource {clocksource} doesn't support clock_realtime flag") + if clock_realtime and global_props.host_linux_version_tpl < (5, 16): + pytest.skip("clock_realtime is not supported on Linux < 5.16") + + boot_args = ( + "reboot=k panic=1 nomodule swiotlb=noforce 
console=ttyS0" + f" clocksource={clocksource}" + ) + + vm = uvm_plain_any + vm.spawn() + vm.basic_config(vcpu_count=2, mem_size_mib=256, boot_args=boot_args) + vm.add_net_iface() + vm.start() + + # Confirm the clocksource took effect + active = read_guest_clocksource(vm) + _, avail_out, _ = vm.ssh.check_output( + "cat /sys/devices/system/clocksource/clocksource0/available_clocksource" + ) + print("Available clocksources: %s", avail_out.strip()) + if active != clocksource: + pytest.skip(f"Clocksource {clocksource} not available") + + guest_before = read_guest_monotonic(vm) + host_before = time.monotonic() + + snapshot = vm.snapshot_full() + vm.kill() + + print("Sleeping %ds between snapshot and restore...", SLEEP_SECONDS) + time.sleep(SLEEP_SECONDS) + + restored_vm = microvm_factory.build_from_snapshot( + snapshot, clock_realtime=clock_realtime + ) + + guest_after = read_guest_monotonic(restored_vm) + host_after = time.monotonic() + + # Confirm clocksource survived the restore + active_after = read_guest_clocksource(restored_vm) + assert ( + active_after == clocksource + ), f"Clocksource changed after restore: {clocksource} -> {active_after}" + + guest_delta = guest_after - guest_before + host_delta = host_after - host_before + + # If guest_delta is close to host_delta, the clock jumped forward + # (suspend/resume behavior). If it's near 0, it resumed from where + # it left off. + jumped = abs(guest_delta - host_delta) < 5.0 + + jumped_str = "JUMPED" if jumped else "RESUMED" + + print( + f"Host kernel: {global_props.host_linux_version}\n" + f"Clocksource: {clocksource}\n" + f"Guest MONOTONIC before: {guest_before:.3f} s\n" + f"Guest MONOTONIC after: {guest_after:.3f} s\n" + f"Guest delta: {guest_delta:.3f} s\n" + f"Host delta: {host_delta:.3f} s\n" + f"Behavior: {jumped_str}\n" + ) + assert ( + jumped == clock_realtime + ), f"Clock {jumped_str} but clock_realtime was {"not" if clock_realtime else ""} set." 
diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py index b787196f6f5..a951332e1df 100644 --- a/tests/integration_tests/security/test_vulnerabilities.py +++ b/tests/integration_tests/security/test_vulnerabilities.py @@ -17,7 +17,8 @@ from framework.microvm import MicroVMFactory from framework.properties import global_props -CHECKER_URL = "https://raw.githubusercontent.com/speed47/spectre-meltdown-checker/master/spectre-meltdown-checker.sh" +# Pinned due to issues introduced in https://github.com/speed47/spectre-meltdown-checker/pull/527 +CHECKER_URL = "https://raw.githubusercontent.com/speed47/spectre-meltdown-checker/3a822fdcf291ebb8bfbcb77aa216ac342c6b2f12/spectre-meltdown-checker.sh" CHECKER_FILENAME = "spectre-meltdown-checker.sh" REMOTE_CHECKER_PATH = f"/tmp/{CHECKER_FILENAME}" REMOTE_CHECKER_COMMAND = f"sh {REMOTE_CHECKER_PATH} --no-intel-db --batch json" diff --git a/tools/devtool b/tools/devtool index 4aaa37950e0..ee9299ea772 100755 --- a/tools/devtool +++ b/tools/devtool @@ -133,6 +133,9 @@ TARGET_PREFIX="$(uname -m)-unknown-linux-" # Container path to directory where we store built CI artifacts. CTR_CI_ARTIFACTS_PATH="${CTR_FC_ROOT_DIR}/resources/$(uname -m)" +# Lockfile used while modifying KVM modules +KVM_MODULE_LOCKFILE="/tmp/.kvm_module_lock" + # Check if Docker is available and exit if it's not. # Upon returning from this call, the caller can be certain Docker is available. # @@ -583,52 +586,127 @@ ensure_ci_artifacts() { fi } -apply_linux_61_tweaks() { - KV=$(uname -r) - if [[ $KV != 6.1.* ]] || [ $(uname -m) != x86_64 ]; then - return +# Acquire the KVM module lock and run the given command. +# Uses flock with a timeout for safe, automatic lock management. +# Usage: with_kvm_module_lock [args...] +with_kvm_module_lock() { + local LOCK_TIMEOUT=120 + ( + if ! 
flock -w "$LOCK_TIMEOUT" 9; then + say_warn "Timed out waiting for KVM module lock after: ${LOCK_TIMEOUT}s" + exit 1 + fi + echo "Successfully acquired lock" + "$@" + ) 9>"$KVM_MODULE_LOCKFILE" +} + +# Reload KVM modules with the given vendor module and kvm params. +# Always enables avic=1 on AMD. Unloads first if already loaded. +# Usage: reload_kvm_modules [kvm_param...] +# e.g. reload_kvm_modules kvm_intel nx_huge_pages=never +reload_kvm_modules() { + local vendor_mod=$1; shift + + # Unload if already loaded + if lsmod | grep -qP "^kvm_(amd|intel)"; then + if ! sudo modprobe -r $vendor_mod kvm; then + say_warn "Failed to unload KVM modules (${vendor_mod}, kvm) (may be in use)" + return 1 + fi fi - say "Applying Linux 6.1 boot-time regression mitigations" - - KVM_VENDOR_MOD=$(lsmod |grep -P "^kvm_(amd|intel)" | awk '{print $1}') - ITLB_MULTIHIT=/sys/devices/system/cpu/vulnerabilities/itlb_multihit - NX_HUGEPAGES=/sys/module/kvm/parameters/nx_huge_pages - - # If m6a/m6i - if grep -q "Not affected" $ITLB_MULTIHIT; then - echo -e "CPU not vulnerable to iTLB multihit, using kvm.nx_huge_pages=never mitigation" - # we need a lock so another process is not running the same thing and to - # avoid race conditions. - lockfile="/tmp/.linux61_tweaks.lock" - set -C # noclobber - while true; do - if echo "$$" > "$lockfile"; then - echo "Successfully acquired lock" - if ! grep -q "never" $NX_HUGEPAGES; then - echo "Reloading KVM modules with nx_huge_pages=never" - sudo modprobe -r $KVM_VENDOR_MOD kvm - sudo modprobe kvm nx_huge_pages=never - sudo modprobe $KVM_VENDOR_MOD - fi - rm "$lockfile" - break - else - sleep 5s - fi - done - tail -v $ITLB_MULTIHIT $NX_HUGEPAGES - # else (m5d Skylake and CascadeLake) + + if ! sudo modprobe kvm "$@"; then + say_warn "Failed to load kvm module" + return 1 + fi + if [[ $vendor_mod == "kvm_amd" ]]; then + if ! 
sudo modprobe kvm_amd avic=1; then + say_warn "Failed to load kvm_amd module" + return 1 + fi else - echo "CPU vulnerable to iTLB_multihit, checking if favordynmods is enabled" - mount |grep cgroup |grep -q favordynmods - if [ $? -ne 0 ]; then - say_warn "cgroups' favordynmods option not enabled; VM creation performance may be impacted" - else - echo "favordynmods is enabled" + if ! sudo modprobe $vendor_mod; then + say_warn "Failed to load $vendor_mod module" + return 1 fi fi } +# Determine the KVM vendor module for the current CPU. +kvm_vendor_mod() { + if grep -q "vmx" /proc/cpuinfo; then + echo kvm_intel + elif grep -q "svm" /proc/cpuinfo; then + echo kvm_amd + else + # aarch64 + echo kvm + fi +} + +# Ensure /dev/kvm is available and apply platform-specific KVM tweaks. +# - Loads KVM modules if not present +# - On Linux 6.1 x86_64: applies nx_huge_pages=never for non-vulnerable CPUs, +# checks favordynmods for vulnerable ones +# - On AMD: ensures AVIC is enabled +setup_kvm() { + local kernel_version=$(uname -r) + local arch=$(uname -m) + local vendor_mod=$(kvm_vendor_mod) + + local need_kvm_reload=0 + local kvm_extra_params=() + + # Load KVM if not already available + if [[ ! -c /dev/kvm ]]; then + need_kvm_reload=1 + fi + + local itlb_multihit=/sys/devices/system/cpu/vulnerabilities/itlb_multihit + local nx_huge_pages=/sys/module/kvm/parameters/nx_huge_pages + # Linux 6.1 x86_64: mitigate boot-time regression + if [[ $kernel_version == 6.1.* ]] && [[ $arch == x86_64 ]]; then + + say "Applying Linux 6.1 boot-time regression mitigations" + if grep -q "Not affected" $itlb_multihit; then + echo "CPU not vulnerable to iTLB multihit, using kvm.nx_huge_pages=never mitigation" + if ! 
grep -q "never" $nx_huge_pages 2>/dev/null; then + kvm_extra_params+=(nx_huge_pages=never) + need_kvm_reload=1 + fi + else + echo "CPU vulnerable to iTLB_multihit, checking if favordynmods is enabled" + if mount | grep cgroup | grep -q favordynmods; then + echo "favordynmods is enabled" + else + say_warn "cgroups' favordynmods option not enabled; VM creation performance may be impacted" + fi + fi + fi + + # AMD: ensure AVIC is enabled + local avic_param=/sys/module/kvm_amd/parameters/avic + if [[ $vendor_mod == "kvm_amd" ]]; then + if ! grep -q "Y\|1" $avic_param; then + echo "AVIC not enabled, will reload kvm_amd with avic=1" + need_kvm_reload=1 + fi + fi + + if [[ $need_kvm_reload -eq 1 ]]; then + echo "Reloading KVM modules" + reload_kvm_modules "$vendor_mod" "${kvm_extra_params[@]}" + ok_or_die "Could not reload kvm modules" + fi + + tail -v $itlb_multihit $nx_huge_pages + if [[ $vendor_mod == "kvm_amd" ]]; then + tail -v $avic_param + fi + + [[ -c /dev/kvm ]] || die "/dev/kvm not found. Aborting." +} # Modifies the processors CPU governor and P-state configuration (x86_64 only) for consistent performance. This means # - Disable turbo boost (Intel only) by writing 1 to /sys/devices/system/cpu/intel_pstate/no_turbo @@ -722,7 +800,7 @@ cmd_test() { done # Check prerequisites. - [ $do_kvm_check != 0 ] && ensure_kvm + [ $do_kvm_check != 0 ] && with_kvm_module_lock setup_kvm ensure_devctr ensure_build_dir ensure_ci_artifacts @@ -734,8 +812,6 @@ cmd_test() { fi fi - apply_linux_61_tweaks - # If we got to here, we've got all we need to continue. say "Kernel version: $(uname -r)" say "$(sed '/^processor.*: 0$/,/^processor.*: 1$/!d; /^processor.*: 1$/d' /proc/cpuinfo)" diff --git a/tools/functions b/tools/functions index 90e75c251bd..5268aab5ebc 100644 --- a/tools/functions +++ b/tools/functions @@ -125,15 +125,3 @@ function validate_version { die "Invalid version number: $version. Version should not contain \`wip\` or \`dirty\`." 
fi } - -######################### -# Firecracker functions # -######################### - -# Check if /dev/kvm exists. Exit if it doesn't. -# Upon returning from this call, the caller can be certain /dev/kvm is -# available. -# -ensure_kvm() { - [[ -c /dev/kvm ]] || die "/dev/kvm not found. Aborting." -}