From 786456646af2b94793735c3a0ed190e2c742b16f Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 2 May 2026 13:23:30 -0700 Subject: [PATCH 1/3] virt_kvm: add aarch64 irqfd routing Move irqfd route construction behind an architecture-specific builder so the shared KVM GSI and irqfd lifetime code can be used outside x86 MSI routing. x86 keeps using the existing MSI translation path, now rejecting invalid MSI addresses before installing routes. aarch64 partitions with a GICv2m frame can now expose irqfd support by translating valid SETSPI messages into irqchip routes. --- vm/vmcore/src/irqfd.rs | 17 +-- vmm_core/virt_kvm/src/arch/aarch64/mod.rs | 50 +++++++ vmm_core/virt_kvm/src/arch/x86_64/mod.rs | 89 ++++++++++--- vmm_core/virt_kvm/src/gsi.rs | 154 ++++++++++------------ vmm_core/virt_kvm/src/lib.rs | 5 +- 5 files changed, 198 insertions(+), 117 deletions(-) diff --git a/vm/vmcore/src/irqfd.rs b/vm/vmcore/src/irqfd.rs index 97d3e6219e..514ee03fb3 100644 --- a/vm/vmcore/src/irqfd.rs +++ b/vm/vmcore/src/irqfd.rs @@ -3,20 +3,21 @@ //! Traits for irqfd-based interrupt delivery. //! -//! irqfd allows a hypervisor to directly inject an MSI into a guest when an -//! event is signaled, without involving userspace in the interrupt delivery +//! irqfd allows a hypervisor to directly inject an interrupt into a guest when +//! an event is signaled, without involving userspace in the interrupt delivery //! path. This is used for device passthrough (e.g., VFIO) where the physical -//! device signals an event and the hypervisor injects the corresponding MSI -//! into the guest VM. +//! device signals an event and the hypervisor injects the corresponding +//! interrupt into the guest VM. use pal_event::Event; /// Trait for partitions that support irqfd-based interrupt delivery. /// /// An irqfd associates an event with a GSI (Global System Interrupt), and a -/// GSI routing table maps GSIs to MSI addresses and data values. When the -/// event is signaled, the kernel looks up the GSI routing and injects the -/// configured MSI into the guest without a usermode transition. +/// GSI routing table maps GSIs to architecture-specific interrupt message +/// routes. When the event is signaled, the kernel looks up the GSI routing and +/// injects the configured interrupt into the guest without a usermode +/// transition. pub trait IrqFd: Send + Sync { /// Creates a new irqfd route. /// @@ -47,7 +48,7 @@ pub trait IrqFdRoute: Send + Sync { /// is the event handle returned by `WHvCreateTrigger`. fn event(&self) -> &Event; - /// Sets the MSI routing for this irqfd's GSI. + /// Sets the interrupt routing for this irqfd's GSI. /// /// `address` and `data` are the MSI address and data values that the /// hypervisor will use when injecting the interrupt into the guest. diff --git a/vmm_core/virt_kvm/src/arch/aarch64/mod.rs b/vmm_core/virt_kvm/src/arch/aarch64/mod.rs index 67c7135389..3b21f9a9ae 100644 --- a/vmm_core/virt_kvm/src/arch/aarch64/mod.rs +++ b/vmm_core/virt_kvm/src/arch/aarch64/mod.rs @@ -14,7 +14,11 @@ use crate::KvmError; use crate::KvmPartition; use crate::KvmPartitionInner; use crate::KvmRunVpError; +use crate::gsi::GsiRouting; +use crate::gsi::KvmIrqFdState; +use crate::gsi::MsiRouteBuilder; use aarch64defs::SystemReg; +use aarch64defs::gic::GicV2mRegister; use bitfield_struct::bitfield; use core::panic; use hvdef::Vtl; @@ -33,6 +37,7 @@ use kvm::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V2; use kvm::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3; use kvm::kvm_regs; use kvm::user_pt_regs; +use parking_lot::Mutex; use std::convert::Infallible; use std::sync::Arc; use std::sync::atomic::AtomicBool; @@ -779,6 +784,7 @@ impl virt::ProtoPartition for KvmProtoPartition<'_> { eval: false.into(), }) .collect(), + gsi_routing: Mutex::new(GsiRouting::new()), caps, _gic_device: gic_device, gic_v2m: self.config.processor_topology.gic_v2m(), @@ -788,6 +794,7 @@ impl virt::ProtoPartition for KvmProtoPartition<'_> { let partition = KvmPartition { synic_ports: Arc::new(virt::synic::SynicPorts::new(partition.clone())), + irqfd_state: Arc::new(KvmIrqFdState::new(partition.clone())), inner: partition, }; @@ -834,6 +841,13 @@ impl virt::Partition for KvmPartition { ))) } + fn irqfd(&self) -> Option> { + if self.inner.gic_v2m.is_none() { + return None; + } + Some(self.irqfd_state.clone()) + } + fn request_yield(&self, vp_index: VpIndex) { let vp = &self.inner.vps[vp_index.index() as usize]; if vp.needs_yield.request_yield() { @@ -902,6 +916,42 @@ impl virt::Aarch64Partition for KvmPartition { } } +struct KvmGicV2mRouteBuilder; + +impl MsiRouteBuilder for KvmGicV2mRouteBuilder { + fn routing_entry( + &self, + partition: &KvmPartitionInner, + address: u64, + data: u32, + ) -> Option { + let v2m = partition + .gic_v2m + .as_ref() + .expect("partition does not expose a GICv2m MSI frame"); + let setspi_addr = v2m.frame_base + GicV2mRegister::SETSPI_NS.0 as u64; + if address != setspi_addr { + return None; + } + if !(v2m.spi_base..v2m.spi_base + v2m.spi_count).contains(&data) { + return None; + } + // KVM expects the data to be the GIC SPI number, not the interrupt ID. + // TODO: centralize this constant. + Some(kvm::RoutingEntry::Irqchip { pin: data - 32 }) + } +} + +impl virt::irqfd::IrqFd for KvmIrqFdState { + fn new_irqfd_route(&self) -> anyhow::Result> { + assert!( + self.partition.gic_v2m.is_some(), + "GICv2m is required for irqfd support" + ); + Ok(Box::new(self.new_irqfd_route(KvmGicV2mRouteBuilder)?)) + } +} + impl virt::PartitionAccessState for KvmPartition { type StateAccess<'a> = &'a KvmPartition; diff --git a/vmm_core/virt_kvm/src/arch/x86_64/mod.rs b/vmm_core/virt_kvm/src/arch/x86_64/mod.rs index de46ffd93e..4c91f12b13 100644 --- a/vmm_core/virt_kvm/src/arch/x86_64/mod.rs +++ b/vmm_core/virt_kvm/src/arch/x86_64/mod.rs @@ -15,6 +15,8 @@ use crate::KvmPartitionInner; use crate::KvmProcessorBinder; use crate::KvmRunVpError; use crate::gsi::GsiRouting; +use crate::gsi::KvmIrqFdState; +use crate::gsi::MsiRouteBuilder; use guestmem::DoorbellRegistration; use guestmem::GuestMemory; use guestmem::GuestMemoryError; @@ -459,7 +461,7 @@ impl ProtoPartition for KvmProtoPartition<'_> { let partition = KvmPartition { synic_ports: Arc::new(virt::synic::SynicPorts::new(partition.clone())), - irqfd_state: Arc::new(crate::gsi::KvmIrqFdState::new(partition.clone())), + irqfd_state: Arc::new(KvmIrqFdState::new(partition.clone())), inner: partition, }; @@ -772,8 +774,12 @@ pub(crate) struct KvmMsi { } impl KvmMsi { - pub(crate) fn new(request: MsiRequest) -> Self { + pub(crate) fn new(request: MsiRequest) -> Option { + // TODO: validate the high bits of the request as well, across the codebase. let request_address = MsiAddress::from(request.address as u32); + if request_address.address() != x86defs::msi::MSI_ADDRESS { + return None; + } let request_data = MsiData::from(request.data); // Although architecturally the destination mode bit is only supposed to @@ -796,21 +802,29 @@ impl KvmMsi { .with_vector(request_data.vector()) .into(); - Self { + Some(Self { address_lo, address_hi, data, - } + }) } } impl KvmPartitionInner { fn request_msi(&self, request: MsiRequest) { - let KvmMsi { + let Some(KvmMsi { address_lo, address_hi, data, - } = KvmMsi::new(request); + }) = KvmMsi::new(request) + else { + tracelimit::warn_ratelimited!( + address = request.address, + data = request.data, + "invalid MSI address" + ); + return; + }; if let Err(err) = self.kvm.request_msi(&kvm::kvm_msi { address_lo, address_hi, @@ -829,20 +843,59 @@ impl KvmPartitionInner { } } +struct KvmX86MsiRouteBuilder; + +impl MsiRouteBuilder for KvmX86MsiRouteBuilder { + fn routing_entry( + &self, + _partition: &KvmPartitionInner, + address: u64, + data: u32, + ) -> Option { + let KvmMsi { + address_lo, + address_hi, + data, + } = KvmMsi::new(MsiRequest { address, data })?; + Some(kvm::RoutingEntry::Msi { + address_lo, + address_hi, + data, + }) + } +} + +impl virt::irqfd::IrqFd for KvmIrqFdState { + fn new_irqfd_route(&self) -> anyhow::Result> { + Ok(Box::new(self.new_irqfd_route(KvmX86MsiRouteBuilder)?)) + } +} + impl IoApicRouting for KvmPartitionInner { fn set_irq_route(&self, irq: u8, request: Option) { - let entry = request.map(|request| { - let KvmMsi { - address_lo, - address_hi, - data, - } = KvmMsi::new(request); - kvm::RoutingEntry::Msi { - address_lo, - address_hi, - data, - } - }); + let entry = match request { + Some(request) => match KvmMsi::new(request) { + Some(KvmMsi { + address_lo, + address_hi, + data, + }) => Some(kvm::RoutingEntry::Msi { + address_lo, + address_hi, + data, + }), + None => { + tracelimit::warn_ratelimited!( + irq, + address = request.address, + data = request.data, + "invalid MSI address for IO-APIC route" + ); + None + } + }, + None => None, + }; let mut gsi_routing = self.gsi_routing.lock(); if gsi_routing.set(irq as u32, entry) { gsi_routing.update_routes(&self.kvm); diff --git a/vmm_core/virt_kvm/src/gsi.rs b/vmm_core/virt_kvm/src/gsi.rs index ee2bd2d91b..d6c4cbf1da 100644 --- a/vmm_core/virt_kvm/src/gsi.rs +++ b/vmm_core/virt_kvm/src/gsi.rs @@ -12,7 +12,6 @@ use std::sync::Arc; use std::sync::Weak; use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering; -use virt::irqfd::IrqFd; use virt::irqfd::IrqFdRoute; const NUM_GSIS: usize = 2048; @@ -32,6 +31,7 @@ impl GsiRouting { } /// Claims a specific GSI. + #[cfg_attr(guest_arch = "aarch64", expect(dead_code))] pub fn claim(&mut self, gsi: u32) { let gsi = gsi as usize; assert_eq!(self.states[gsi], GsiState::Unallocated); @@ -83,14 +83,16 @@ impl GsiRouting { impl KvmPartitionInner { /// Reserves a new route, optionally with an associated irqfd event. - pub(crate) fn new_route(self: &Arc, irqfd_event: Option) -> Option { + fn new_route(self: &Arc, irqfd_event: Option) -> Option { let gsi = self.gsi_routing.lock().alloc()?; Some(GsiRoute { partition: Arc::downgrade(self), - gsi, - irqfd_event, - enabled: false.into(), - enable_mutex: Mutex::new(()), + inner: GsiRouteInner { + gsi, + irqfd_event, + enabled: false.into(), + enable_mutex: Mutex::new(()), + }, }) } } @@ -109,9 +111,12 @@ impl GsiState { } /// A GSI route. -#[derive(Debug)] -pub struct GsiRoute { +struct GsiRoute { partition: Weak, + inner: GsiRouteInner, +} + +struct GsiRouteInner { gsi: u32, irqfd_event: Option, enabled: AtomicBool, @@ -120,32 +125,28 @@ pub struct GsiRoute { impl Drop for GsiRoute { fn drop(&mut self) { - self.disable(); - self.set_entry(None); if let Some(partition) = self.partition.upgrade() { - partition.gsi_routing.lock().free(self.gsi); + self.inner.disable(&partition); + self.inner.set_entry(&partition, None); + partition.gsi_routing.lock().free(self.inner.gsi); } } } -impl GsiRoute { - fn set_entry(&self, new_entry: Option) -> Option> { - let partition = self.partition.upgrade(); - if let Some(partition) = &partition { - let mut routing = partition.gsi_routing.lock(); - if routing.set(self.gsi, new_entry) { - routing.update_routes(&partition.kvm); - } +impl GsiRouteInner { + fn set_entry(&self, partition: &KvmPartitionInner, new_entry: Option) { + let mut routing = partition.gsi_routing.lock(); + if routing.set(self.gsi, new_entry) { + routing.update_routes(&partition.kvm); } - partition } /// Enables the route and associated irqfd. - pub fn enable(&self, entry: kvm::RoutingEntry) { + pub fn enable(&self, partition: &KvmPartitionInner, entry: kvm::RoutingEntry) { let _lock = self.enable_mutex.lock(); - let partition = self.set_entry(Some(entry)); + self.set_entry(partition, Some(entry)); if !self.enabled.load(Ordering::Relaxed) { - if let (Some(partition), Some(event)) = (&partition, &self.irqfd_event) { + if let Some(event) = &self.irqfd_event { partition .kvm .irqfd(self.gsi, event.as_fd().as_raw_fd(), true) @@ -159,74 +160,43 @@ impl GsiRoute { /// /// This actually leaves the route configured, but it disables the irqfd and /// clears the `enabled` bool so that `signal` won't. - pub fn disable(&self) { + pub fn disable(&self, partition: &KvmPartitionInner) { let _lock = self.enable_mutex.lock(); if self.enabled.load(Ordering::Relaxed) { if let Some(irqfd_event) = &self.irqfd_event { - if let Some(partition) = self.partition.upgrade() { - partition - .kvm - .irqfd(self.gsi, irqfd_event.as_fd().as_raw_fd(), false) - .expect("should not fail"); - } - } - self.enabled.store(false, Ordering::Relaxed); - } - } - - /// Returns the configured irqfd event, if there is one. - #[expect(dead_code)] - pub fn irqfd_event(&self) -> Option<&Event> { - self.irqfd_event.as_ref() - } - - /// Signals the interrupt if it is enabled. - #[expect(clippy::assertions_on_constants)] - #[expect(dead_code)] - pub fn signal(&self) { - // Use a relaxed atomic read to avoid extra synchronization in this - // path. It's up to callers to synchronize this with `enable`/`disable` - // if strict ordering is necessary. - if self.enabled.load(Ordering::Relaxed) { - if let Some(event) = &self.irqfd_event { - event.signal(); - } else if let Some(partition) = self.partition.upgrade() { - // TODO: `gsi` must include certain flags on aarch64 to indicate - // the type of the interrupt: SPI or PPI handled by the in-kernel vGIC, - // or the user mode GIC emulator (where have to specify the target VP, too). - - assert!(cfg!(guest_arch = "x86_64")); partition .kvm - .irq_line(self.gsi, true) - .expect("interrupt delivery failure"); + .irqfd(self.gsi, irqfd_event.as_fd().as_raw_fd(), false) + .expect("should not fail"); } + self.enabled.store(false, Ordering::Relaxed); } } } -/// irqfd routing interface for a KVM partition. -/// -/// Wraps the existing [`GsiRoute`] infrastructure to implement the -/// [`IrqFd`]/[`IrqFdRoute`] traits. -pub struct KvmIrqFdState { - partition: Arc, +pub(crate) struct KvmIrqFdState { + pub(crate) partition: Arc, } impl KvmIrqFdState { pub fn new(partition: Arc) -> Self { Self { partition } } -} -impl IrqFd for KvmIrqFdState { - fn new_irqfd_route(&self) -> anyhow::Result> { + pub fn new_irqfd_route( + &self, + builder: T, + ) -> anyhow::Result> { let event = Event::new(); let route = self .partition .new_route(Some(event.clone())) .context("no free GSIs available for irqfd")?; - Ok(Box::new(KvmIrqFdRoute { route, event })) + Ok(KvmIrqFdRoute { + builder, + route, + event, + }) } } @@ -234,35 +204,45 @@ impl IrqFd for KvmIrqFdState { /// /// Cleanup (disable irqfd, clear route, free GSI) is handled by /// [`GsiRoute::drop`]. -struct KvmIrqFdRoute { +pub(crate) struct KvmIrqFdRoute { + builder: T, route: GsiRoute, event: Event, } -impl IrqFdRoute for KvmIrqFdRoute { +pub(crate) trait MsiRouteBuilder: Send + Sync { + fn routing_entry( + &self, + partition: &KvmPartitionInner, + address: u64, + data: u32, + ) -> Option; +} + +impl IrqFdRoute for KvmIrqFdRoute { fn event(&self) -> &Event { &self.event } fn enable(&self, address: u64, data: u32) { - let crate::arch::KvmMsi { - address_lo, - address_hi, - data, - } = crate::arch::KvmMsi::new(virt::irqcon::MsiRequest { address, data }); - let entry = kvm::RoutingEntry::Msi { - address_lo, - address_hi, - data, - }; - self.route.enable(entry); + if let Some(partition) = self.route.partition.upgrade() { + if let Some(entry) = self.builder.routing_entry(&partition, address, data) { + self.route.inner.enable(&partition, entry); + } else { + tracelimit::warn_ratelimited!( + address, + data, + "failed to build irqfd interrupt route" + ); + self.route.inner.disable(&partition); + self.route.inner.set_entry(&partition, None); + } + } } fn disable(&self) { - // Just disarm the irqfd. The routing entry is inert without an - // armed eventfd, so there's no need to remove it and trigger an - // expensive KVM_SET_GSI_ROUTING ioctl. On re-enable, if the - // address/data haven't changed, set_entry will be a no-op. - self.route.disable(); + if let Some(partition) = self.route.partition.upgrade() { + self.route.inner.disable(&partition); + } } } diff --git a/vmm_core/virt_kvm/src/lib.rs b/vmm_core/virt_kvm/src/lib.rs index 1b8467fc49..91609f3214 100644 --- a/vmm_core/virt_kvm/src/lib.rs +++ b/vmm_core/virt_kvm/src/lib.rs @@ -10,7 +10,6 @@ #![expect(clippy::undocumented_unsafe_blocks)] mod arch; -#[cfg(guest_arch = "x86_64")] mod gsi; pub use arch::Kvm; @@ -86,9 +85,8 @@ pub struct KvmPartition { inner: Arc, #[inspect(skip)] synic_ports: Arc>, - #[cfg(guest_arch = "x86_64")] #[inspect(skip)] - irqfd_state: Arc, + irqfd_state: Arc, } #[derive(Inspect)] @@ -100,7 +98,6 @@ struct KvmPartitionInner { gm: GuestMemory, #[inspect(skip)] vps: Vec, - #[cfg(guest_arch = "x86_64")] #[inspect(skip)] gsi_routing: Mutex, caps: virt::PartitionCapabilities, From d99d72eee07884c9990f15b5b15ee9dc45b3f285 Mon Sep 17 00:00:00 2001 From: John Starks Date: Thu, 7 May 2026 17:15:27 -0700 Subject: [PATCH 2/3] comments --- vm/vmcore/src/irqfd.rs | 17 ++++++++--------- vmm_core/virt_kvm/src/gsi.rs | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/vm/vmcore/src/irqfd.rs b/vm/vmcore/src/irqfd.rs index 514ee03fb3..97d3e6219e 100644 --- a/vm/vmcore/src/irqfd.rs +++ b/vm/vmcore/src/irqfd.rs @@ -3,21 +3,20 @@ //! Traits for irqfd-based interrupt delivery. //! -//! irqfd allows a hypervisor to directly inject an interrupt into a guest when -//! an event is signaled, without involving userspace in the interrupt delivery +//! irqfd allows a hypervisor to directly inject an MSI into a guest when an +//! event is signaled, without involving userspace in the interrupt delivery //! path. This is used for device passthrough (e.g., VFIO) where the physical -//! device signals an event and the hypervisor injects the corresponding -//! interrupt into the guest VM. +//! device signals an event and the hypervisor injects the corresponding MSI +//! into the guest VM. use pal_event::Event; /// Trait for partitions that support irqfd-based interrupt delivery. /// /// An irqfd associates an event with a GSI (Global System Interrupt), and a -/// GSI routing table maps GSIs to architecture-specific interrupt message -/// routes. When the event is signaled, the kernel looks up the GSI routing and -/// injects the configured interrupt into the guest without a usermode -/// transition. +/// GSI routing table maps GSIs to MSI addresses and data values. When the +/// event is signaled, the kernel looks up the GSI routing and injects the +/// configured MSI into the guest without a usermode transition. pub trait IrqFd: Send + Sync { /// Creates a new irqfd route. /// @@ -48,7 +47,7 @@ pub trait IrqFdRoute: Send + Sync { /// is the event handle returned by `WHvCreateTrigger`. fn event(&self) -> &Event; - /// Sets the interrupt routing for this irqfd's GSI. + /// Sets the MSI routing for this irqfd's GSI. /// /// `address` and `data` are the MSI address and data values that the /// hypervisor will use when injecting the interrupt into the guest. diff --git a/vmm_core/virt_kvm/src/gsi.rs b/vmm_core/virt_kvm/src/gsi.rs index d6c4cbf1da..f070d81311 100644 --- a/vmm_core/virt_kvm/src/gsi.rs +++ b/vmm_core/virt_kvm/src/gsi.rs @@ -159,7 +159,7 @@ impl GsiRouteInner { /// Disables the associated irqfd. /// /// This actually leaves the route configured, but it disables the irqfd and - /// clears the `enabled` bool so that `signal` won't. + /// clears the `enabled` flag. pub fn disable(&self, partition: &KvmPartitionInner) { let _lock = self.enable_mutex.lock(); if self.enabled.load(Ordering::Relaxed) { From 89af3c83f5d328fc5ac25619fabce4e59179f634 Mon Sep 17 00:00:00 2001 From: John Starks Date: Thu, 7 May 2026 17:16:44 -0700 Subject: [PATCH 3/3] clippy --- vmm_core/virt_kvm/src/arch/aarch64/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vmm_core/virt_kvm/src/arch/aarch64/mod.rs b/vmm_core/virt_kvm/src/arch/aarch64/mod.rs index 3b21f9a9ae..e6788b7038 100644 --- a/vmm_core/virt_kvm/src/arch/aarch64/mod.rs +++ b/vmm_core/virt_kvm/src/arch/aarch64/mod.rs @@ -842,9 +842,8 @@ impl virt::Partition for KvmPartition { } fn irqfd(&self) -> Option> { - if self.inner.gic_v2m.is_none() { - return None; - } + // The irqfd implementation requires a GICv2m frame to be present. + self.inner.gic_v2m?; Some(self.irqfd_state.clone()) }