From 06ff46ae3a28c05ef63834bfdf8eee6ba248a42c Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Sat, 21 Mar 2026 15:03:26 -0700 Subject: [PATCH 01/28] initial work --- vm/devices/net/gdma/src/bnic.rs | 1 + vm/devices/net/net_backend/src/lib.rs | 11 +- vm/devices/net/net_backend/src/null.rs | 1 + .../net/net_consomme/consomme/src/lib.rs | 8 + .../net/net_consomme/consomme/src/udp.rs | 19 +- .../net/net_consomme/consomme/src/unix.rs | 170 +++++++++++++++++- .../net/net_consomme/consomme/src/windows.rs | 74 ++++++++ vm/devices/net/net_consomme/src/lib.rs | 5 + vm/devices/virtio/virtio_net/src/lib.rs | 47 +++-- vm/devices/virtio/virtio_net/src/tests.rs | 78 ++++++++ 10 files changed, 390 insertions(+), 24 deletions(-) diff --git a/vm/devices/net/gdma/src/bnic.rs b/vm/devices/net/gdma/src/bnic.rs index 6653e2f7ff..0fe74b8004 100644 --- a/vm/devices/net/gdma/src/bnic.rs +++ b/vm/devices/net/gdma/src/bnic.rs @@ -545,6 +545,7 @@ impl TxRxTask { l3_len: oob.s_oob.trans_off().clamp(14, 255) - 14, l4_len: 0, max_tcp_segment_size: 0, + max_udp_segment_size: 0, }; if sqe.header.params.client_oob_in_sgl() { diff --git a/vm/devices/net/net_backend/src/lib.rs b/vm/devices/net/net_backend/src/lib.rs index bf74883b40..fe8d937c81 100644 --- a/vm/devices/net/net_backend/src/lib.rs +++ b/vm/devices/net/net_backend/src/lib.rs @@ -130,6 +130,8 @@ pub struct TxOffloadSupport { pub udp: bool, /// TCP segmentation offload. pub tso: bool, + /// UDP segmentation offload (UFO). + pub ufo: bool, } #[derive(Debug, Clone)] @@ -329,6 +331,9 @@ pub struct TxMetadata { /// The maximum TCP segment size, used for segmentation. Only guaranteed to /// be set if [`TxFlags::offload_tcp_segmentation`] is set. pub max_tcp_segment_size: u16, + /// The maximum UDP segment size, used for UDP segmentation offload. Only + /// guaranteed to be set if [`TxFlags::offload_udp_segmentation`] is set. + pub max_udp_segment_size: u16, } /// Flags affecting transmit behavior. 
@@ -356,7 +361,10 @@ pub struct TxFlags { pub is_ipv4: bool, /// If true, the packet is IPv6. Mutually exclusive with `is_ipv4`. pub is_ipv6: bool, - #[bits(2)] + /// Offload UDP segmentation (UFO), allowing UDP packets larger than the + /// MTU. `l2_len`, `l3_len`, and `max_udp_segment_size` must be set. + pub offload_udp_segmentation: bool, + #[bits(1)] _reserved: u8, } @@ -371,6 +379,7 @@ impl Default for TxMetadata { l3_len: 0, l4_len: 0, max_tcp_segment_size: 0, + max_udp_segment_size: 0, } } } diff --git a/vm/devices/net/net_backend/src/null.rs b/vm/devices/net/net_backend/src/null.rs index fa09077f81..6d02215d4e 100644 --- a/vm/devices/net/net_backend/src/null.rs +++ b/vm/devices/net/net_backend/src/null.rs @@ -85,6 +85,7 @@ impl Endpoint for NullEndpoint { tcp: true, udp: true, tso: true, + ufo: true, } } diff --git a/vm/devices/net/net_consomme/consomme/src/lib.rs b/vm/devices/net/net_consomme/consomme/src/lib.rs index 806516ce91..63a59636e1 100644 --- a/vm/devices/net/net_consomme/consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/consomme/src/lib.rs @@ -295,6 +295,9 @@ pub struct ChecksumState { /// /// The IP header's length field may be invalid and should be ignored. pub tso: Option, + /// The data is a large UDP payload that should be sent with OS-level UDP + /// GSO, splitting into UDP datagrams of this segment size. 
+ pub gso: Option, } impl ChecksumState { @@ -303,30 +306,35 @@ impl ChecksumState { tcp: false, udp: false, tso: None, + gso: None, }; const IPV4_ONLY: Self = Self { ipv4: true, tcp: false, udp: false, tso: None, + gso: None, }; const TCP4: Self = Self { ipv4: true, tcp: true, udp: false, tso: None, + gso: None, }; const UDP4: Self = Self { ipv4: true, tcp: false, udp: true, tso: None, + gso: None, }; const TCP6: Self = Self { ipv4: false, tcp: true, udp: false, tso: None, + gso: None, }; fn caps(&self) -> ChecksumCapabilities { diff --git a/vm/devices/net/net_consomme/consomme/src/udp.rs b/vm/devices/net/net_consomme/consomme/src/udp.rs index 80ebda3ccd..2279a15ad4 100644 --- a/vm/devices/net/net_consomme/consomme/src/udp.rs +++ b/vm/devices/net/net_consomme/consomme/src/udp.rs @@ -55,6 +55,11 @@ use std::time::Instant; use crate::DNS_PORT; +#[cfg(unix)] +use crate::unix as platform; +#[cfg(windows)] +use crate::windows as platform; + pub(crate) struct Udp { connections: HashMap, timeout: Duration, @@ -299,13 +304,13 @@ impl Access<'_, T> { }; let conn = self.get_or_insert(guest_addr, Some(frame.src_addr))?; - match conn - .socket - .as_mut() - .unwrap() - .get() - .send_to(udp_packet.payload(), dst_sock_addr) - { + let socket = conn.socket.as_mut().unwrap().get(); + let result = if let Some(seg_size) = checksum.gso { + platform::send_udp_with_gso(socket, udp_packet.payload(), &dst_sock_addr, seg_size) + } else { + socket.send_to(udp_packet.payload(), dst_sock_addr) + }; + match result { Ok(_) => { conn.stats.tx_packets.increment(); conn.last_activity = Instant::now(); diff --git a/vm/devices/net/net_consomme/consomme/src/unix.rs b/vm/devices/net/net_consomme/consomme/src/unix.rs index 3a73dcc64f..29a8a924fe 100644 --- a/vm/devices/net/net_consomme/consomme/src/unix.rs +++ b/vm/devices/net/net_consomme/consomme/src/unix.rs @@ -2,16 +2,176 @@ // Licensed under the MIT License. #![cfg(unix)] -//! Unix implementation of host IPv6 address detection. +//! 
Unix platform helpers for consomme. //! -//! Uses `getifaddrs()` from libc to check if the host has any non-link-local -//! IPv6 unicast addresses assigned. +//! - IPv6 address detection via `getifaddrs()`. +//! - UDP GSO batch send: +//! - Linux: `sendmsg(2)` + `UDP_SEGMENT` cmsg (kernel segmentation). +//! - macOS: `sendmsg_x()` private API (user-space segments, one syscall). +//! - Other Unix: software loop over `send_to()`. -// UNSAFETY: Calling libc getifaddrs/freeifaddrs and walking the resulting -// linked list of interface addresses. +// UNSAFETY: getifaddrs/freeifaddrs; sendmsg with a manually built msghdr; +// sendmsg_x (private Apple API) with a manually built msghdr_x array. #![expect(unsafe_code)] use std::net::Ipv6Addr; +use std::net::SocketAddr; +use std::net::UdpSocket; + +/// Send `data` as a UDP GSO batch, splitting into datagrams of `seg_size` +/// bytes each. +/// +/// - **Linux**: one `sendmsg(2)` call with a `UDP_SEGMENT` control message; +/// the kernel (or NIC driver) performs the segmentation. +/// - **macOS**: one `sendmsg_x()` call (private Apple API) with one +/// `msghdr_x` entry per segment; user-space segments but a single syscall. +/// - **Other Unix**: software loop — one `send_to()` call per segment. +#[cfg(target_os = "linux")] +pub fn send_udp_with_gso( + socket: &UdpSocket, + data: &[u8], + dst: &SocketAddr, + seg_size: u16, +) -> std::io::Result { + use std::mem::size_of; + use std::os::unix::io::AsRawFd; + + let sockaddr = socket2::SockAddr::from(*dst); + let iov = libc::iovec { + iov_base: data.as_ptr() as *mut libc::c_void, + iov_len: data.len(), + }; + let cmsg_space = + // SAFETY: computing the buffer size for a single u16 cmsg. 
+ unsafe { libc::CMSG_SPACE(size_of::() as u32) as usize }; + let mut cmsg_buf = vec![0u8; cmsg_space]; + // Use zeroed() + field assignment rather than struct literal syntax: + // musl's msghdr has private padding fields on 64-bit targets that make + // struct literal initialization fail to compile. + // SAFETY: all-zero is a valid initializer for msghdr. + let mut msg: libc::msghdr = unsafe { std::mem::zeroed() }; + msg.msg_name = sockaddr.as_ptr() as *mut libc::c_void; + msg.msg_namelen = sockaddr.len(); + msg.msg_iov = &iov as *const libc::iovec as *mut libc::iovec; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf.as_mut_ptr() as *mut libc::c_void; + msg.msg_controllen = cmsg_space as _; + // SAFETY: msg_control and msg_controllen point to our allocated buffer, + // which is large enough for a single u16 UDP_SEGMENT control message. + let cmsg = unsafe { &mut *libc::CMSG_FIRSTHDR(&msg) }; + cmsg.cmsg_level = libc::IPPROTO_UDP; + cmsg.cmsg_type = libc::UDP_SEGMENT; + cmsg.cmsg_len = + // SAFETY: computing the cmsg_len for a single u16 data field. + unsafe { libc::CMSG_LEN(size_of::() as u32) as _ }; + // SAFETY: writing a u16 into the CMSG data area, which is correctly + // sized for a u16 payload. + unsafe { *(libc::CMSG_DATA(cmsg) as *mut u16) = seg_size }; + + // SAFETY: calling sendmsg(2) with a correctly constructed msghdr. + let ret = unsafe { libc::sendmsg(socket.as_raw_fd(), &msg, 0) }; + if ret < 0 { + Err(std::io::Error::last_os_error()) + } else { + Ok(ret as usize) + } +} + +/// macOS batch send using the private `sendmsg_x()` API. +/// +/// `sendmsg_x()` and `msghdr_x` are undocumented Apple extensions (present +/// since macOS 10.11) that allow sending multiple datagrams in a single +/// syscall. `msghdr_x` is identical to the standard `msghdr` except for an +/// extra `msg_datalen` field that records the byte count for each entry. +/// `sendmsg_x` returns the number of messages queued, not bytes. 
+/// +/// This gives us user-space segmentation with a single syscall, rather than +/// one syscall per segment. +#[cfg(target_os = "macos")] +pub fn send_udp_with_gso( + socket: &UdpSocket, + data: &[u8], + dst: &SocketAddr, + seg_size: u16, +) -> std::io::Result { + use std::os::unix::io::AsRawFd; + + // Private Apple extension of msghdr: identical layout up to msg_flags, + // then an extra msg_datalen field that holds the per-entry byte count. + #[repr(C)] + struct MsghdrX { + msg_name: *mut libc::c_void, + msg_namelen: libc::socklen_t, + msg_iov: *mut libc::iovec, + msg_iovlen: libc::c_int, + msg_control: *mut libc::c_void, + msg_controllen: libc::socklen_t, + msg_flags: libc::c_int, + msg_datalen: libc::size_t, + } + + unsafe extern "C" { + /// Batch-send `cnt` datagrams described by `msgp[0..cnt]`. + /// Returns the number of messages queued, or -1 on error. + fn sendmsg_x( + s: libc::c_int, + msgp: *const MsghdrX, + cnt: libc::c_uint, + flags: libc::c_int, + ) -> isize; + } + + let sockaddr = socket2::SockAddr::from(*dst); + let seg_size = seg_size as usize; + + // Build one iovec per segment. Collected up front so the Vec's heap + // allocation is stable before we take raw pointers into it. + let iovecs: Vec = data + .chunks(seg_size) + .map(|chunk| libc::iovec { + iov_base: chunk.as_ptr() as *mut libc::c_void, + iov_len: chunk.len(), + }) + .collect(); + + // Build a matching msghdr_x per segment. Each entry shares the same + // destination address and points to its own iovec. + let hdrs: Vec = iovecs + .iter() + .map(|iov| MsghdrX { + msg_name: sockaddr.as_ptr() as *mut libc::c_void, + msg_namelen: sockaddr.len(), + msg_iov: iov as *const libc::iovec as *mut libc::iovec, + msg_iovlen: 1, + msg_control: std::ptr::null_mut(), + msg_controllen: 0, + msg_flags: 0, + msg_datalen: iov.iov_len, + }) + .collect(); + + // SAFETY: sendmsg_x reads hdrs[0..hdrs.len()]. 
Each entry holds a valid + // pointer into iovecs (stable for the duration of this call) and a + // borrowed pointer to sockaddr (also live for this call). hdrs is passed + // as a non-null, correctly sized slice. + let sent = unsafe { + sendmsg_x( + socket.as_raw_fd(), + hdrs.as_ptr(), + hdrs.len() as libc::c_uint, + 0, + ) + }; + + if sent < 0 { + return Err(std::io::Error::last_os_error()); + } + + // sendmsg_x returns the number of messages queued. Sum the byte counts of + // the successfully sent entries to produce the total byte count. + Ok(iovecs[..sent as usize].iter().map(|iov| iov.iov_len).sum()) +} + /// Checks whether the host has at least one non-link-local, non-loopback /// IPv6 unicast address assigned. diff --git a/vm/devices/net/net_consomme/consomme/src/windows.rs b/vm/devices/net/net_consomme/consomme/src/windows.rs index 7842b1ebda..d15627216b 100644 --- a/vm/devices/net/net_consomme/consomme/src/windows.rs +++ b/vm/devices/net/net_consomme/consomme/src/windows.rs @@ -7,7 +7,10 @@ #![expect(unsafe_code)] use socket2::Socket; +use std::mem::size_of; use std::net::Ipv6Addr; +use std::net::SocketAddr; +use std::net::UdpSocket; use std::os::windows::io::AsRawSocket; use std::ptr::null_mut; use windows_sys::Win32::Foundation::ERROR_SUCCESS; @@ -95,3 +98,74 @@ pub fn host_has_ipv6_address() -> Result { Ok(has_ipv6) } + +/// Send `data` as a UDP GSO batch using `WSASendMsg` with a +/// `UDP_SEND_MSG_SIZE` control message so the Windows network stack splits +/// it into datagrams of `seg_size` bytes each. +pub fn send_udp_with_gso( + socket: &UdpSocket, + data: &[u8], + dst: &SocketAddr, + seg_size: u16, +) -> std::io::Result { + // UDP_SEND_MSG_SIZE tells WSASendMsg the per-segment size. 
+ const UDP_SEND_MSG_SIZE: i32 = 2; + + let sockaddr = socket2::SockAddr::from(*dst); + let seg_size_dword = seg_size as u32; + + let buf = WinSock::WSABUF { + len: data.len() as u32, + buf: data.as_ptr() as *mut u8, + }; + + let cmsg_space = + // SAFETY: computing the buffer size for a single u32 WSA cmsg. + unsafe { WinSock::WSA_CMSG_SPACE(size_of::() as u32) as usize }; + let mut cmsg_buf = vec![0u8; cmsg_space]; + + let wsamsg = WinSock::WSAMSG { + name: sockaddr.as_ptr() as *mut WinSock::SOCKADDR, + namelen: sockaddr.len() as i32, + lpBuffers: &buf as *const WinSock::WSABUF as *mut WinSock::WSABUF, + dwBufferCount: 1, + Control: WinSock::WSABUF { + buf: cmsg_buf.as_mut_ptr(), + len: cmsg_space as u32, + }, + dwFlags: 0, + }; + + // SAFETY: filling the WSAMSG control buffer per the WSA_CMSG documentation. + let cmsg = unsafe { &mut *WinSock::WSA_CMSG_FIRSTHDR(&wsamsg) }; + cmsg.cmsg_level = WinSock::IPPROTO_UDP as i32; + cmsg.cmsg_type = UDP_SEND_MSG_SIZE; + cmsg.cmsg_len = + // SAFETY: computing cmsg_len for a single u32 data field. + unsafe { WinSock::WSA_CMSG_LEN(size_of::() as u32) }; + // SAFETY: writing a u32 into the CMSG data area, which is correctly + // sized for a u32 payload. + unsafe { *(WinSock::WSA_CMSG_DATA(cmsg) as *mut u32) = seg_size_dword }; + + let mut bytes_sent = 0u32; + // SAFETY: calling WSASendMsg with a correctly constructed WSAMSG. + let ret = unsafe { + WinSock::WSASendMsg( + socket.as_raw_socket() as WinSock::SOCKET, + &wsamsg, + 0, + &mut bytes_sent, + null_mut(), + None, + ) + }; + + if ret == WinSock::SOCKET_ERROR { + Err(std::io::Error::from_raw_os_error( + // SAFETY: WSAGetLastError is safe to call after a socket error. 
+ unsafe { WinSock::WSAGetLastError() } as i32, + )) + } else { + Ok(bytes_sent as usize) + } +} diff --git a/vm/devices/net/net_consomme/src/lib.rs b/vm/devices/net/net_consomme/src/lib.rs index fae9794736..1d05483f83 100644 --- a/vm/devices/net/net_consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/src/lib.rs @@ -217,6 +217,7 @@ impl net_backend::Endpoint for ConsommeEndpoint { tcp: true, udp: true, tso: true, + ufo: true, } } } @@ -332,6 +333,10 @@ impl net_backend::Queue for ConsommeQueue { .flags .offload_tcp_segmentation() .then_some(meta.max_tcp_segment_size), + gso: meta + .flags + .offload_udp_segmentation() + .then_some(meta.max_udp_segment_size), }; let mut buf = vec![0; meta.len as usize]; diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index a8a580a413..1eff332f31 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -262,13 +262,16 @@ impl VirtioDevice for Device { // must compute per-segment IPv4 header checksums. let host_tso4 = offloads.tso && offloads.tcp && offloads.ipv4_header; let host_tso6 = offloads.tso && offloads.tcp; + // VIRTIO_NET_F_HOST_UFO: we can handle UDP segmentation from the guest. + let host_ufo = offloads.ufo && offloads.udp; let features_bank0 = NetworkFeaturesBank0::new() .with_mac(true) .with_csum(csum) .with_guest_csum(true) .with_host_tso4(host_tso4) - .with_host_tso6(host_tso6); + .with_host_tso6(host_tso6) + .with_host_ufo(host_ufo); DeviceTraits { device_id: 1, @@ -1055,6 +1058,7 @@ impl Worker { let mut l3_len: u16 = 0; let mut l4_len: u8 = 0; let mut max_tcp_segment_size: u16 = 0; + let mut max_udp_segment_size: u16 = 0; // Determine IP version from GSO type when available. let is_ipv4_from_gso = gso_protocol == VirtioNetHeaderGsoProtocol::TCPV4; @@ -1118,10 +1122,11 @@ impl Worker { } // GSO (segmentation offload) — only honor if the corresponding - // HOST_TSO feature was negotiated. 
+ // HOST_TSO/HOST_UFO feature was negotiated. let gso_enabled = match gso_protocol { VirtioNetHeaderGsoProtocol::TCPV4 => features.host_tso4(), VirtioNetHeaderGsoProtocol::TCPV6 => features.host_tso6(), + VirtioNetHeaderGsoProtocol::UDP => features.host_ufo(), _ => false, }; if gso_enabled { @@ -1136,6 +1141,8 @@ impl Worker { l3_len = header.csum_start - l2_len as u16; } + let is_udp = gso_protocol == VirtioNetHeaderGsoProtocol::UDP; + // Derive l4_len from hdr_len if available: // hdr_len = l2_len + l3_len + l4_len (total header length) let total_hdr = header.hdr_len as u32; @@ -1147,16 +1154,33 @@ impl Worker { } } - // Only enable segmentation if we derived valid header lengths. - if l3_len > 0 && l4_len > 0 { - flags.set_offload_tcp_segmentation(true); - flags.set_offload_tcp_checksum(true); - flags.set_offload_udp_checksum(false); - max_tcp_segment_size = header.gso_size; + // For UDP GSO, hdr_len==0 is acceptable (fixed 8-byte header). + // For TCP GSO, we need a valid l4_len to proceed. + let valid_lengths = + l3_len > 0 && (l4_len > 0 || (is_udp && header.hdr_len == 0)); + + if valid_lengths { + if is_udp { + flags.set_offload_udp_segmentation(true); + // Guest omits the UDP checksum for GSO packets; ask + // the backend to fill it in. + flags.set_offload_udp_checksum(true); + flags.set_offload_tcp_checksum(false); + max_udp_segment_size = header.gso_size; + } else { + flags.set_offload_tcp_segmentation(true); + flags.set_offload_tcp_checksum(true); + flags.set_offload_udp_checksum(false); + max_tcp_segment_size = header.gso_size; + } - flags.set_is_ipv4(is_ipv4_from_gso); - flags.set_is_ipv6(is_ipv6_from_gso); - if is_ipv4_from_gso { + // UDP GSO has no separate TCPV4/TCPV6 variants, + // so derive IP version from EtherType; TCP uses GSO type. 
+ let is_ipv4 = if is_udp { is_ipv4_from_eth } else { is_ipv4_from_gso }; + let is_ipv6 = if is_udp { is_ipv6_from_eth } else { is_ipv6_from_gso }; + flags.set_is_ipv4(is_ipv4); + flags.set_is_ipv6(is_ipv6); + if is_ipv4 { flags.set_offload_ip_header_checksum(true); } } @@ -1169,6 +1193,7 @@ impl Worker { l3_len, l4_len, max_tcp_segment_size, + max_udp_segment_size, ..Default::default() } } diff --git a/vm/devices/virtio/virtio_net/src/tests.rs b/vm/devices/virtio/virtio_net/src/tests.rs index 2d06ef658e..790b692ac8 100644 --- a/vm/devices/virtio/virtio_net/src/tests.rs +++ b/vm/devices/virtio/virtio_net/src/tests.rs @@ -1127,6 +1127,13 @@ fn features_tso() -> NetworkFeaturesBank0 { .with_host_tso6(true) } +/// Features with CSUM + HOST_UFO enabled (for UFO tests). +fn features_ufo() -> NetworkFeaturesBank0 { + NetworkFeaturesBank0::new() + .with_csum(true) + .with_host_ufo(true) +} + /// Build a minimal Ethernet header (14 bytes) with the given EtherType. fn make_eth_header(ethertype: u16) -> [u8; 14] { let mut h = [0u8; 14]; @@ -1356,6 +1363,76 @@ fn tx_offload_tso_without_needs_csum() { assert_eq!(meta.max_tcp_segment_size, 1460); } +/// UFO4: gso_type=UDP with IPv4 EtherType. 
+#[test] +fn tx_offload_ufo4() { + // IPv4 UFO: 14 (L2) + 20 (L3) + 8 (UDP) = 42 byte header + let header = make_virtio_header( + true, + VirtioNetHeaderGsoProtocol::UDP, + 42, // hdr_len + 1472, // gso_size (UDP payload per segment) + 34, // csum_start (14 + 20) + 6, // UDP csum offset + ); + let meta = Worker::parse_tx_offloads( + Some(&header), + &make_eth_header(ETHERTYPE_IPV4), + 65000, + features_ufo(), + ); + assert!( + meta.flags.offload_udp_segmentation(), + "UFO should be set" + ); + assert!( + meta.flags.offload_udp_checksum(), + "UDP csum should be set for UFO" + ); + assert!( + meta.flags.offload_ip_header_checksum(), + "IPv4 header csum should be set" + ); + assert!(meta.flags.is_ipv4()); + assert!(!meta.flags.is_ipv6()); + assert_eq!(meta.l2_len, 14); + assert_eq!(meta.l3_len, 20); + assert_eq!(meta.max_udp_segment_size, 1472); + assert!(!meta.flags.offload_tcp_segmentation()); +} + +/// UFO6: gso_type=UDP with IPv6 EtherType. +#[test] +fn tx_offload_ufo6() { + // IPv6 UFO: 14 (L2) + 40 (L3) + 8 (UDP) = 62 byte header + let header = make_virtio_header( + true, + VirtioNetHeaderGsoProtocol::UDP, + 62, // hdr_len + 1452, // gso_size + 54, // csum_start (14 + 40) + 6, // UDP csum offset + ); + let meta = Worker::parse_tx_offloads( + Some(&header), + &make_eth_header(ETHERTYPE_IPV6), + 65000, + features_ufo(), + ); + assert!(meta.flags.offload_udp_segmentation(), "UFO should be set"); + assert!(meta.flags.offload_udp_checksum()); + assert!( + !meta.flags.offload_ip_header_checksum(), + "no IPv4 header csum for IPv6" + ); + assert!(!meta.flags.is_ipv4()); + assert!(meta.flags.is_ipv6()); + assert_eq!(meta.l2_len, 14); + assert_eq!(meta.l3_len, 40); + assert_eq!(meta.max_udp_segment_size, 1452); + assert!(!meta.flags.offload_tcp_segmentation()); +} + /// VLAN-tagged frame: 802.1Q tag (EtherType 0x8100) wrapping IPv4. 
#[test] fn tx_offload_vlan_ipv4() { @@ -1635,6 +1712,7 @@ async fn feature_negotiation_with_offloads(driver: DefaultDriver) { tcp: true, udp: true, tso: true, + ufo: false, }, }; From 588ad0f2ebf7757909df942e620c2370c3af0eee Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Sat, 21 Mar 2026 16:24:11 -0700 Subject: [PATCH 02/28] . --- .../net/net_consomme/consomme/src/udp.rs | 11 +- .../net/net_consomme/consomme/src/unix.rs | 103 ++++++++--------- .../net/net_consomme/consomme/src/windows.rs | 107 +++++++----------- 3 files changed, 102 insertions(+), 119 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/udp.rs b/vm/devices/net/net_consomme/consomme/src/udp.rs index 2279a15ad4..82243fe693 100644 --- a/vm/devices/net/net_consomme/consomme/src/udp.rs +++ b/vm/devices/net/net_consomme/consomme/src/udp.rs @@ -95,6 +95,9 @@ struct UdpConnection { recycle: bool, #[inspect(debug)] last_activity: Instant, + /// The UDP GSO segment size currently configured on the socket (0 = disabled). + /// Tracked to avoid redundant setsockopt calls on every packet. 
+ gso_size: u16, } #[derive(Inspect, Default)] @@ -304,7 +307,12 @@ impl Access<'_, T> { }; let conn = self.get_or_insert(guest_addr, Some(frame.src_addr))?; - let socket = conn.socket.as_mut().unwrap().get(); + let socket = conn.socket.as_ref().unwrap().get(); + let new_gso = checksum.gso.unwrap_or(0); + if conn.gso_size != new_gso { + platform::set_udp_gso_size(socket, new_gso).map_err(DropReason::Io)?; + conn.gso_size = new_gso; + } let result = if let Some(seg_size) = checksum.gso { platform::send_udp_with_gso(socket, udp_packet.payload(), &dst_sock_addr, seg_size) } else { @@ -354,6 +362,7 @@ impl Access<'_, T> { stats: Default::default(), recycle: false, last_activity: Instant::now(), + gso_size: 0, }; Ok(e.insert(conn)) } diff --git a/vm/devices/net/net_consomme/consomme/src/unix.rs b/vm/devices/net/net_consomme/consomme/src/unix.rs index 29a8a924fe..eafc450c79 100644 --- a/vm/devices/net/net_consomme/consomme/src/unix.rs +++ b/vm/devices/net/net_consomme/consomme/src/unix.rs @@ -6,11 +6,12 @@ //! //! - IPv6 address detection via `getifaddrs()`. //! - UDP GSO batch send: -//! - Linux: `sendmsg(2)` + `UDP_SEGMENT` cmsg (kernel segmentation). +//! - Linux: `setsockopt(IPPROTO_UDP, UDP_SEGMENT)` sets the segment size +//! once per connection; subsequent `send_to()` calls are segmented by the +//! kernel automatically. //! - macOS: `sendmsg_x()` private API (user-space segments, one syscall). -//! - Other Unix: software loop over `send_to()`. -// UNSAFETY: getifaddrs/freeifaddrs; sendmsg with a manually built msghdr; +// UNSAFETY: getifaddrs/freeifaddrs; setsockopt for UDP_SEGMENT (Linux); // sendmsg_x (private Apple API) with a manually built msghdr_x array. #![expect(unsafe_code)] @@ -18,63 +19,57 @@ use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::UdpSocket; -/// Send `data` as a UDP GSO batch, splitting into datagrams of `seg_size` -/// bytes each. +/// Configure the UDP GSO segment size on `socket`. 
/// -/// - **Linux**: one `sendmsg(2)` call with a `UDP_SEGMENT` control message; -/// the kernel (or NIC driver) performs the segmentation. -/// - **macOS**: one `sendmsg_x()` call (private Apple API) with one -/// `msghdr_x` entry per segment; user-space segments but a single syscall. -/// - **Other Unix**: software loop — one `send_to()` call per segment. +/// On Linux this calls `setsockopt(IPPROTO_UDP, UDP_SEGMENT, size)`, which +/// persists for the lifetime of the connection. When `size` is 0 the option +/// is cleared and normal (non-GSO) sends resume. On macOS `sendmsg_x` conveys +/// the segment size per-call, so this is a no-op. #[cfg(target_os = "linux")] -pub fn send_udp_with_gso( - socket: &UdpSocket, - data: &[u8], - dst: &SocketAddr, - seg_size: u16, -) -> std::io::Result { +pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { use std::mem::size_of; use std::os::unix::io::AsRawFd; - let sockaddr = socket2::SockAddr::from(*dst); - let iov = libc::iovec { - iov_base: data.as_ptr() as *mut libc::c_void, - iov_len: data.len(), + // SAFETY: setsockopt with a valid u16 optval per Linux udp(7) documentation + // for UDP_SEGMENT. + let ret = unsafe { + libc::setsockopt( + socket.as_raw_fd(), + libc::IPPROTO_UDP, + libc::UDP_SEGMENT, + std::ptr::from_ref(&size).cast::(), + size_of::() as libc::socklen_t, + ) }; - let cmsg_space = - // SAFETY: computing the buffer size for a single u16 cmsg. - unsafe { libc::CMSG_SPACE(size_of::() as u32) as usize }; - let mut cmsg_buf = vec![0u8; cmsg_space]; - // Use zeroed() + field assignment rather than struct literal syntax: - // musl's msghdr has private padding fields on 64-bit targets that make - // struct literal initialization fail to compile. - // SAFETY: all-zero is a valid initializer for msghdr. 
- let mut msg: libc::msghdr = unsafe { std::mem::zeroed() }; - msg.msg_name = sockaddr.as_ptr() as *mut libc::c_void; - msg.msg_namelen = sockaddr.len(); - msg.msg_iov = &iov as *const libc::iovec as *mut libc::iovec; - msg.msg_iovlen = 1; - msg.msg_control = cmsg_buf.as_mut_ptr() as *mut libc::c_void; - msg.msg_controllen = cmsg_space as _; - // SAFETY: msg_control and msg_controllen point to our allocated buffer, - // which is large enough for a single u16 UDP_SEGMENT control message. - let cmsg = unsafe { &mut *libc::CMSG_FIRSTHDR(&msg) }; - cmsg.cmsg_level = libc::IPPROTO_UDP; - cmsg.cmsg_type = libc::UDP_SEGMENT; - cmsg.cmsg_len = - // SAFETY: computing the cmsg_len for a single u16 data field. - unsafe { libc::CMSG_LEN(size_of::() as u32) as _ }; - // SAFETY: writing a u16 into the CMSG data area, which is correctly - // sized for a u16 payload. - unsafe { *(libc::CMSG_DATA(cmsg) as *mut u16) = seg_size }; - - // SAFETY: calling sendmsg(2) with a correctly constructed msghdr. - let ret = unsafe { libc::sendmsg(socket.as_raw_fd(), &msg, 0) }; - if ret < 0 { - Err(std::io::Error::last_os_error()) - } else { - Ok(ret as usize) + if ret != 0 { + return Err(std::io::Error::last_os_error()); } + Ok(()) +} + +/// Configure the UDP GSO segment size on `socket`. +/// +/// On macOS the segment size is conveyed per-send via `sendmsg_x`, so there +/// is nothing to configure on the socket itself. This is a no-op. +#[cfg(target_os = "macos")] +pub fn set_udp_gso_size(_socket: &UdpSocket, _size: u16) -> std::io::Result<()> { + Ok(()) +} + +/// Send `data` as a UDP GSO batch. +/// +/// On Linux, `UDP_SEGMENT` must already be configured on the socket via +/// [`set_udp_gso_size`]. The kernel then automatically splits the outgoing +/// buffer into datagrams of that size. The `seg_size` parameter is accepted +/// for API uniformity with the macOS implementation but is not used here. 
+#[cfg(target_os = "linux")] +pub fn send_udp_with_gso( + socket: &UdpSocket, + data: &[u8], + dst: &SocketAddr, + _seg_size: u16, +) -> std::io::Result { + socket.send_to(data, *dst) } /// macOS batch send using the private `sendmsg_x()` API. @@ -141,7 +136,7 @@ pub fn send_udp_with_gso( .map(|iov| MsghdrX { msg_name: sockaddr.as_ptr() as *mut libc::c_void, msg_namelen: sockaddr.len(), - msg_iov: iov as *const libc::iovec as *mut libc::iovec, + msg_iov: std::ptr::from_ref(iov).cast_mut(), msg_iovlen: 1, msg_control: std::ptr::null_mut(), msg_controllen: 0, diff --git a/vm/devices/net/net_consomme/consomme/src/windows.rs b/vm/devices/net/net_consomme/consomme/src/windows.rs index d15627216b..2acb826547 100644 --- a/vm/devices/net/net_consomme/consomme/src/windows.rs +++ b/vm/devices/net/net_consomme/consomme/src/windows.rs @@ -2,8 +2,8 @@ // Licensed under the MIT License. #![cfg(windows)] -// UNSAFETY: Calling Win32 APIs to set TCP initial RTO and to check host IPv6 -// addresses. +// UNSAFETY: Calling Win32 APIs to set TCP initial RTO, to configure UDP GSO +// via setsockopt, and to check host IPv6 addresses. #![expect(unsafe_code)] use socket2::Socket; @@ -99,73 +99,52 @@ pub fn host_has_ipv6_address() -> Result { Ok(has_ipv6) } -/// Send `data` as a UDP GSO batch using `WSASendMsg` with a -/// `UDP_SEND_MSG_SIZE` control message so the Windows network stack splits -/// it into datagrams of `seg_size` bytes each. -pub fn send_udp_with_gso( - socket: &UdpSocket, - data: &[u8], - dst: &SocketAddr, - seg_size: u16, -) -> std::io::Result { - // UDP_SEND_MSG_SIZE tells WSASendMsg the per-segment size. - const UDP_SEND_MSG_SIZE: i32 = 2; - - let sockaddr = socket2::SockAddr::from(*dst); - let seg_size_dword = seg_size as u32; - - let buf = WinSock::WSABUF { - len: data.len() as u32, - buf: data.as_ptr() as *mut u8, - }; - - let cmsg_space = - // SAFETY: computing the buffer size for a single u32 WSA cmsg. 
- unsafe { WinSock::WSA_CMSG_SPACE(size_of::() as u32) as usize }; - let mut cmsg_buf = vec![0u8; cmsg_space]; - - let wsamsg = WinSock::WSAMSG { - name: sockaddr.as_ptr() as *mut WinSock::SOCKADDR, - namelen: sockaddr.len() as i32, - lpBuffers: &buf as *const WinSock::WSABUF as *mut WinSock::WSABUF, - dwBufferCount: 1, - Control: WinSock::WSABUF { - buf: cmsg_buf.as_mut_ptr(), - len: cmsg_space as u32, - }, - dwFlags: 0, - }; - - // SAFETY: filling the WSAMSG control buffer per the WSA_CMSG documentation. - let cmsg = unsafe { &mut *WinSock::WSA_CMSG_FIRSTHDR(&wsamsg) }; - cmsg.cmsg_level = WinSock::IPPROTO_UDP as i32; - cmsg.cmsg_type = UDP_SEND_MSG_SIZE; - cmsg.cmsg_len = - // SAFETY: computing cmsg_len for a single u32 data field. - unsafe { WinSock::WSA_CMSG_LEN(size_of::() as u32) }; - // SAFETY: writing a u32 into the CMSG data area, which is correctly - // sized for a u32 payload. - unsafe { *(WinSock::WSA_CMSG_DATA(cmsg) as *mut u32) = seg_size_dword }; - - let mut bytes_sent = 0u32; - // SAFETY: calling WSASendMsg with a correctly constructed WSAMSG. +// UDP_SEND_MSG_SIZE = 2 (ws2ipdef.h, IPPROTO_UDP level). +const UDP_SEND_MSG_SIZE: i32 = 2; + +/// Configure the `UDP_SEND_MSG_SIZE` socket option on `socket`. +/// +/// When `size` is non-zero the Windows networking stack automatically splits +/// each outgoing send buffer into UDP datagrams of that many bytes. Setting +/// it to 0 disables segmentation and restores normal send behaviour. +/// +/// This is called once when the GSO segment size changes, not on every send, +/// so the option persists for the lifetime of the connection. +pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { + let raw = socket.as_raw_socket() as WinSock::SOCKET; + let size_dword = size as u32; + // SAFETY: setsockopt with a valid DWORD optval per MSDN documentation for + // UDP_SEND_MSG_SIZE. 
let ret = unsafe { - WinSock::WSASendMsg( - socket.as_raw_socket() as WinSock::SOCKET, - &wsamsg, - 0, - &mut bytes_sent, - null_mut(), - None, + WinSock::setsockopt( + raw, + WinSock::IPPROTO_UDP as i32, + UDP_SEND_MSG_SIZE, + std::ptr::from_ref(&size_dword).cast::(), + size_of::() as i32, ) }; - if ret == WinSock::SOCKET_ERROR { - Err(std::io::Error::from_raw_os_error( + return Err(std::io::Error::from_raw_os_error( // SAFETY: WSAGetLastError is safe to call after a socket error. unsafe { WinSock::WSAGetLastError() } as i32, - )) - } else { - Ok(bytes_sent as usize) + )); } + Ok(()) +} + +/// Send `data` as a UDP GSO batch. +/// +/// The `UDP_SEND_MSG_SIZE` socket option must already be set to the desired +/// segment size via [`set_udp_gso_size`] before calling this function. The +/// Windows networking stack then automatically splits each outgoing send into +/// datagrams of that size. The `seg_size` parameter is accepted for API +/// uniformity with the Unix implementation but is not used here. +pub fn send_udp_with_gso( + socket: &UdpSocket, + data: &[u8], + dst: &SocketAddr, + _seg_size: u16, +) -> std::io::Result { + socket.send_to(data, *dst) } From e12bfda578d3e9bed75e8ba64c0a1e8f14b20208 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Sat, 21 Mar 2026 17:26:35 -0700 Subject: [PATCH 03/28] . 
--- .../net/net_consomme/consomme/src/lib.rs | 24 ++++- .../net/net_consomme/consomme/src/unix.rs | 1 - vm/devices/virtio/virtio_net/src/lib.rs | 66 +++++++++---- vm/devices/virtio/virtio_net/src/tests.rs | 93 +++++++++++++------ 4 files changed, 132 insertions(+), 52 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/lib.rs b/vm/devices/net/net_consomme/consomme/src/lib.rs index 63a59636e1..162aebe6a5 100644 --- a/vm/devices/net/net_consomme/consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/consomme/src/lib.rs @@ -600,12 +600,20 @@ impl Access<'_, T> { if payload.len() < IPV4_HEADER_LEN || ipv4.version() != 4 || payload.len() < ipv4.header_len().into() - || payload.len() < ipv4.total_len().into() { return Err(DropReason::MalformedPacket); } - let total_len = if checksum.tso.is_some() { + // For segmentation offload (TSO/USO) the IP total_length field may + // not reflect the actual buffer size (it can hold only one segment's + // worth or wrap for payloads > 64 KiB). Use the real buffer length + // instead. + let segmentation_offload = checksum.tso.is_some() || checksum.gso.is_some(); + if !segmentation_offload && payload.len() < ipv4.total_len().into() { + return Err(DropReason::MalformedPacket); + } + + let total_len = if segmentation_offload { payload.len() } else { ipv4.total_len().into() @@ -653,9 +661,15 @@ impl Access<'_, T> { return Err(DropReason::MalformedPacket); } - let required_len = smoltcp::wire::IPV6_HEADER_LEN + ipv6.payload_len() as usize; - if payload.len() < required_len { - return Err(DropReason::MalformedPacket); + // For segmentation offload (TSO/USO) the IPv6 payload_length field + // may not reflect the actual buffer size. Skip the length validation + // and use the full buffer. 
+ let segmentation_offload = checksum.tso.is_some() || checksum.gso.is_some(); + if !segmentation_offload { + let required_len = smoltcp::wire::IPV6_HEADER_LEN + ipv6.payload_len() as usize; + if payload.len() < required_len { + return Err(DropReason::MalformedPacket); + } } let next_header = ipv6.next_header(); diff --git a/vm/devices/net/net_consomme/consomme/src/unix.rs b/vm/devices/net/net_consomme/consomme/src/unix.rs index eafc450c79..f95596e2f4 100644 --- a/vm/devices/net/net_consomme/consomme/src/unix.rs +++ b/vm/devices/net/net_consomme/consomme/src/unix.rs @@ -167,7 +167,6 @@ pub fn send_udp_with_gso( Ok(iovecs[..sent as usize].iter().map(|iov| iov.iov_len).sum()) } - /// Checks whether the host has at least one non-link-local, non-loopback /// IPv6 unicast address assigned. pub fn host_has_ipv6_address() -> Result { diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index 1eff332f31..98ef12db4c 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -262,20 +262,26 @@ impl VirtioDevice for Device { // must compute per-segment IPv4 header checksums. let host_tso4 = offloads.tso && offloads.tcp && offloads.ipv4_header; let host_tso6 = offloads.tso && offloads.tcp; - // VIRTIO_NET_F_HOST_UFO: we can handle UDP segmentation from the guest. - let host_ufo = offloads.ufo && offloads.udp; + // VIRTIO_NET_F_HOST_USO (bank 1): we can handle UDP segmentation from + // the guest. This is the modern USO feature (bit 56); the legacy + // HOST_UFO (bit 14) is not offered because it is deprecated in modern + // Linux kernels and causes connectivity issues. 
+ let host_uso = offloads.ufo && offloads.udp; let features_bank0 = NetworkFeaturesBank0::new() .with_mac(true) .with_csum(csum) .with_guest_csum(true) .with_host_tso4(host_tso4) - .with_host_tso6(host_tso6) - .with_host_ufo(host_ufo); + .with_host_tso6(host_tso6); + + let features_bank1 = NetworkFeaturesBank1::new().with_host_uso(host_uso); DeviceTraits { device_id: 1, - device_features: VirtioDeviceFeatures::new().with_bank(0, features_bank0.into_bits()), + device_features: VirtioDeviceFeatures::new() + .with_bank(0, features_bank0.into_bits()) + .with_bank(1, features_bank1.into_bits()), max_queues: 2 * self.registers.max_virtqueue_pairs, device_register_length: size_of::() as u32, shared_memory: DeviceTraitsSharedMemory { id: 0, size: 0 }, @@ -324,6 +330,7 @@ impl VirtioDevice for Device { .context("failed creating virtio net queue")?; let negotiated_features = NetworkFeaturesBank0::from(features.bank(0)); + let negotiated_features_bank1 = NetworkFeaturesBank1::from(features.bank(1)); let pair_idx = (idx / 2) as usize; let is_rx = idx.is_multiple_of(2); @@ -379,7 +386,12 @@ impl VirtioDevice for Device { tx_queue, tx_queue_size, }; - self.insert_worker(virtio_state, pair_idx, negotiated_features); + self.insert_worker( + virtio_state, + pair_idx, + negotiated_features, + negotiated_features_bank1, + ); if first_pair { self.coordinator.start(); @@ -609,6 +621,7 @@ impl Device { virtio_state: VirtioState, idx: usize, negotiated_features: NetworkFeaturesBank0, + negotiated_features_bank1: NetworkFeaturesBank1, ) { let mut builder = self.driver_source.builder(); // TODO: set this correctly @@ -629,6 +642,7 @@ impl Device { virtio_state, active_state, negotiated_features, + negotiated_features_bank1, }; let coordinator = self.coordinator.state_mut().unwrap(); let worker_task = &mut coordinator.workers[idx]; @@ -835,6 +849,8 @@ struct Worker { active_state: ActiveState, #[inspect(skip)] negotiated_features: NetworkFeaturesBank0, + #[inspect(skip)] + 
negotiated_features_bank1: NetworkFeaturesBank1, } impl Worker { @@ -1021,6 +1037,7 @@ impl Worker { packet_prefix, packet_len, self.negotiated_features, + self.negotiated_features_bank1, ); self.active_state.data.tx_segments[seg_start].ty = TxSegmentType::Head(TxMetadata { @@ -1044,6 +1061,7 @@ impl Worker { packet_prefix: &[u8], packet_len: u32, features: NetworkFeaturesBank0, + features_bank1: NetworkFeaturesBank1, ) -> TxMetadata { let Some(header) = header else { return TxMetadata::default(); @@ -1122,13 +1140,17 @@ impl Worker { } // GSO (segmentation offload) — only honor if the corresponding - // HOST_TSO/HOST_UFO feature was negotiated. - let gso_enabled = match gso_protocol { - VirtioNetHeaderGsoProtocol::TCPV4 => features.host_tso4(), - VirtioNetHeaderGsoProtocol::TCPV6 => features.host_tso6(), - VirtioNetHeaderGsoProtocol::UDP => features.host_ufo(), - _ => false, - }; + // HOST_TSO/HOST_USO feature was negotiated. Per the virtio spec, all + // GSO features require VIRTIO_NET_F_CSUM; guard against a misbehaving + // guest that negotiates GSO without CSUM. + let gso_enabled = features.csum() + && match gso_protocol { + VirtioNetHeaderGsoProtocol::TCPV4 => features.host_tso4(), + VirtioNetHeaderGsoProtocol::TCPV6 => features.host_tso6(), + VirtioNetHeaderGsoProtocol::UDP => features.host_ufo(), + VirtioNetHeaderGsoProtocol::UDP_L4 => features_bank1.host_uso(), + _ => false, + }; if gso_enabled { if l2_len == 0 { l2_len = parsed_l2_len; @@ -1141,7 +1163,8 @@ impl Worker { l3_len = header.csum_start - l2_len as u16; } - let is_udp = gso_protocol == VirtioNetHeaderGsoProtocol::UDP; + let is_udp = gso_protocol == VirtioNetHeaderGsoProtocol::UDP + || gso_protocol == VirtioNetHeaderGsoProtocol::UDP_L4; // Derive l4_len from hdr_len if available: // hdr_len = l2_len + l3_len + l4_len (total header length) @@ -1156,8 +1179,7 @@ impl Worker { // For UDP GSO, hdr_len==0 is acceptable (fixed 8-byte header). // For TCP GSO, we need a valid l4_len to proceed. 
- let valid_lengths = - l3_len > 0 && (l4_len > 0 || (is_udp && header.hdr_len == 0)); + let valid_lengths = l3_len > 0 && (l4_len > 0 || (is_udp && header.hdr_len == 0)); if valid_lengths { if is_udp { @@ -1176,8 +1198,16 @@ impl Worker { // UDP GSO has no separate TCPV4/TCPV6 variants, // so derive IP version from EtherType; TCP uses GSO type. - let is_ipv4 = if is_udp { is_ipv4_from_eth } else { is_ipv4_from_gso }; - let is_ipv6 = if is_udp { is_ipv6_from_eth } else { is_ipv6_from_gso }; + let is_ipv4 = if is_udp { + is_ipv4_from_eth + } else { + is_ipv4_from_gso + }; + let is_ipv6 = if is_udp { + is_ipv6_from_eth + } else { + is_ipv6_from_gso + }; flags.set_is_ipv4(is_ipv4); flags.set_is_ipv6(is_ipv6); if is_ipv4 { diff --git a/vm/devices/virtio/virtio_net/src/tests.rs b/vm/devices/virtio/virtio_net/src/tests.rs index 790b692ac8..ea7a043fb0 100644 --- a/vm/devices/virtio/virtio_net/src/tests.rs +++ b/vm/devices/virtio/virtio_net/src/tests.rs @@ -43,6 +43,7 @@ use vmcore::vm_task::VmTaskDriverSource; use crate::Device; use crate::NetworkFeaturesBank0; +use crate::NetworkFeaturesBank1; use crate::VirtioNetHeader; use crate::VirtioNetHeaderFlags; use crate::VirtioNetHeaderGso; @@ -1127,11 +1128,14 @@ fn features_tso() -> NetworkFeaturesBank0 { .with_host_tso6(true) } -/// Features with CSUM + HOST_UFO enabled (for UFO tests). -fn features_ufo() -> NetworkFeaturesBank0 { - NetworkFeaturesBank0::new() - .with_csum(true) - .with_host_ufo(true) +/// Features with HOST_USO enabled in bank 1 (for USO tests). +fn features_uso_bank1() -> NetworkFeaturesBank1 { + NetworkFeaturesBank1::new().with_host_uso(true) +} + +/// Default (empty) bank 1 features for non-USO tests. +fn no_bank1() -> NetworkFeaturesBank1 { + NetworkFeaturesBank1::new() } /// Build a minimal Ethernet header (14 bytes) with the given EtherType. 
@@ -1173,7 +1177,7 @@ fn make_virtio_header( #[test] fn tx_offload_no_offloads() { let header = VirtioNetHeader::new_zeroed(); - let meta = Worker::parse_tx_offloads(Some(&header), &[], 1000, features_csum()); + let meta = Worker::parse_tx_offloads(Some(&header), &[], 1000, features_csum(), no_bank1()); assert_eq!( meta.flags.into_bits(), TxFlags::new().into_bits(), @@ -1187,7 +1191,7 @@ fn tx_offload_no_offloads() { /// No header at all → default TxMetadata. #[test] fn tx_offload_none_header() { - let meta = Worker::parse_tx_offloads(None, &[], 500, features_csum()); + let meta = Worker::parse_tx_offloads(None, &[], 500, features_csum(), no_bank1()); assert_eq!(meta.flags.into_bits(), TxFlags::new().into_bits()); } @@ -1208,6 +1212,7 @@ fn tx_offload_tcp_checksum() { &make_eth_header(ETHERTYPE_IPV4), 1000, features_csum(), + no_bank1(), ); assert!(meta.flags.offload_tcp_checksum(), "TCP csum should be set"); assert!( @@ -1241,6 +1246,7 @@ fn tx_offload_udp_checksum() { &make_eth_header(ETHERTYPE_IPV4), 800, features_csum(), + no_bank1(), ); assert!(!meta.flags.offload_tcp_checksum()); assert!(meta.flags.offload_udp_checksum(), "UDP csum should be set"); @@ -1266,6 +1272,7 @@ fn tx_offload_ipv6_tcp_checksum() { &make_eth_header(ETHERTYPE_IPV6), 1000, features_csum(), + no_bank1(), ); assert!(meta.flags.offload_tcp_checksum()); // EtherType tells us this is IPv6. 
@@ -1291,6 +1298,7 @@ fn tx_offload_tso4() { &make_eth_header(ETHERTYPE_IPV4), 64000, features_tso(), + no_bank1(), ); assert!(meta.flags.offload_tcp_segmentation(), "TSO should be set"); assert!( @@ -1326,6 +1334,7 @@ fn tx_offload_tso6() { &make_eth_header(ETHERTYPE_IPV6), 64000, features_tso(), + no_bank1(), ); assert!(meta.flags.offload_tcp_segmentation()); assert!(meta.flags.offload_tcp_checksum()); @@ -1357,19 +1366,20 @@ fn tx_offload_tso_without_needs_csum() { &make_eth_header(ETHERTYPE_IPV4), 64000, features_tso(), + no_bank1(), ); assert!(meta.flags.offload_tcp_segmentation()); assert!(meta.flags.is_ipv4()); assert_eq!(meta.max_tcp_segment_size, 1460); } -/// UFO4: gso_type=UDP with IPv4 EtherType. +/// USO4: gso_type=UDP_L4 with IPv4 EtherType. #[test] -fn tx_offload_ufo4() { - // IPv4 UFO: 14 (L2) + 20 (L3) + 8 (UDP) = 42 byte header +fn tx_offload_uso4() { + // IPv4 USO: 14 (L2) + 20 (L3) + 8 (UDP) = 42 byte header let header = make_virtio_header( true, - VirtioNetHeaderGsoProtocol::UDP, + VirtioNetHeaderGsoProtocol::UDP_L4, 42, // hdr_len 1472, // gso_size (UDP payload per segment) 34, // csum_start (14 + 20) @@ -1379,15 +1389,13 @@ fn tx_offload_ufo4() { Some(&header), &make_eth_header(ETHERTYPE_IPV4), 65000, - features_ufo(), - ); - assert!( - meta.flags.offload_udp_segmentation(), - "UFO should be set" + features_csum(), + features_uso_bank1(), ); + assert!(meta.flags.offload_udp_segmentation(), "USO should be set"); assert!( meta.flags.offload_udp_checksum(), - "UDP csum should be set for UFO" + "UDP csum should be set for USO" ); assert!( meta.flags.offload_ip_header_checksum(), @@ -1401,13 +1409,13 @@ fn tx_offload_ufo4() { assert!(!meta.flags.offload_tcp_segmentation()); } -/// UFO6: gso_type=UDP with IPv6 EtherType. +/// USO6: gso_type=UDP_L4 with IPv6 EtherType. 
#[test] -fn tx_offload_ufo6() { - // IPv6 UFO: 14 (L2) + 40 (L3) + 8 (UDP) = 62 byte header +fn tx_offload_uso6() { + // IPv6 USO: 14 (L2) + 40 (L3) + 8 (UDP) = 62 byte header let header = make_virtio_header( true, - VirtioNetHeaderGsoProtocol::UDP, + VirtioNetHeaderGsoProtocol::UDP_L4, 62, // hdr_len 1452, // gso_size 54, // csum_start (14 + 40) @@ -1417,9 +1425,10 @@ fn tx_offload_ufo6() { Some(&header), &make_eth_header(ETHERTYPE_IPV6), 65000, - features_ufo(), + features_csum(), + features_uso_bank1(), ); - assert!(meta.flags.offload_udp_segmentation(), "UFO should be set"); + assert!(meta.flags.offload_udp_segmentation(), "USO should be set"); assert!(meta.flags.offload_udp_checksum()); assert!( !meta.flags.offload_ip_header_checksum(), @@ -1451,7 +1460,13 @@ fn tx_offload_vlan_ipv4() { 38, // csum_start (18 + 20) 16, // TCP checksum offset ); - let meta = Worker::parse_tx_offloads(Some(&header), &vlan_header, 1000, features_csum()); + let meta = Worker::parse_tx_offloads( + Some(&header), + &vlan_header, + 1000, + features_csum(), + no_bank1(), + ); assert!(meta.flags.offload_tcp_checksum(), "TCP csum should be set"); assert!(meta.flags.is_ipv4(), "should detect IPv4 through VLAN tag"); assert!(!meta.flags.is_ipv6()); @@ -1476,7 +1491,13 @@ fn tx_offload_vlan_ipv6() { 58, // csum_start (18 + 40) 6, // UDP checksum offset ); - let meta = Worker::parse_tx_offloads(Some(&header), &vlan_header, 1000, features_csum()); + let meta = Worker::parse_tx_offloads( + Some(&header), + &vlan_header, + 1000, + features_csum(), + no_bank1(), + ); assert!(meta.flags.offload_udp_checksum(), "UDP csum should be set"); assert!(meta.flags.is_ipv6(), "should detect IPv6 through VLAN tag"); assert!(!meta.flags.is_ipv4()); @@ -1698,8 +1719,8 @@ async fn rx_offload_data_valid_validated_but_wrong(driver: DefaultDriver) { // --- Feature Negotiation Tests --- -/// Verify that the device advertises CSUM and HOST_TSO features when the -/// endpoint supports TCP/UDP/TSO offloads. 
+/// Verify that the device advertises CSUM, HOST_TSO, and HOST_USO features +/// when the endpoint supports TCP/UDP/TSO/USO offloads. #[async_test] async fn feature_negotiation_with_offloads(driver: DefaultDriver) { let mem = GuestMemory::allocate(4096); @@ -1712,7 +1733,7 @@ async fn feature_negotiation_with_offloads(driver: DefaultDriver) { tcp: true, udp: true, tso: true, - ufo: false, + ufo: true, }, }; @@ -1734,6 +1755,16 @@ async fn feature_negotiation_with_offloads(driver: DefaultDriver) { bank0.host_tso6(), "HOST_TSO6 should be set when tso+tcp supported" ); + assert!( + !bank0.host_ufo(), + "HOST_UFO (legacy) should not be set — use HOST_USO instead" + ); + + let bank1 = NetworkFeaturesBank1::from(traits.device_features.bank(1)); + assert!( + bank1.host_uso(), + "HOST_USO should be set when ufo+udp offloads supported" + ); } /// Verify that the device does NOT advertise offload features when the @@ -1763,6 +1794,12 @@ async fn feature_negotiation_no_offloads(driver: DefaultDriver) { !bank0.host_tso6(), "HOST_TSO6 should not be set without TSO" ); + + let bank1 = NetworkFeaturesBank1::from(traits.device_features.bank(1)); + assert!( + !bank1.host_uso(), + "HOST_USO should not be set without USO offloads" + ); } // Mock endpoint that reports specific offload support. 
From f5a6664d62e325895fb4440a9d990ab75b038062 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Sat, 21 Mar 2026 20:57:59 -0700 Subject: [PATCH 04/28] mixed up terminology --- vm/devices/net/net_backend/src/lib.rs | 4 ++-- vm/devices/net/net_backend/src/null.rs | 2 +- vm/devices/net/net_consomme/src/lib.rs | 2 +- vm/devices/virtio/virtio_net/src/lib.rs | 2 +- vm/devices/virtio/virtio_net/src/tests.rs | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vm/devices/net/net_backend/src/lib.rs b/vm/devices/net/net_backend/src/lib.rs index fe8d937c81..5fc4ec57ae 100644 --- a/vm/devices/net/net_backend/src/lib.rs +++ b/vm/devices/net/net_backend/src/lib.rs @@ -130,8 +130,8 @@ pub struct TxOffloadSupport { pub udp: bool, /// TCP segmentation offload. pub tso: bool, - /// UDP segmentation offload (UFO). - pub ufo: bool, + /// UDP segmentation offload (USO). + pub uso: bool, } #[derive(Debug, Clone)] diff --git a/vm/devices/net/net_backend/src/null.rs b/vm/devices/net/net_backend/src/null.rs index 6d02215d4e..51afc6f1da 100644 --- a/vm/devices/net/net_backend/src/null.rs +++ b/vm/devices/net/net_backend/src/null.rs @@ -85,7 +85,7 @@ impl Endpoint for NullEndpoint { tcp: true, udp: true, tso: true, - ufo: true, + uso: true, } } diff --git a/vm/devices/net/net_consomme/src/lib.rs b/vm/devices/net/net_consomme/src/lib.rs index 1d05483f83..c95f15727d 100644 --- a/vm/devices/net/net_consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/src/lib.rs @@ -217,7 +217,7 @@ impl net_backend::Endpoint for ConsommeEndpoint { tcp: true, udp: true, tso: true, - ufo: true, + uso: true, } } } diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index 98ef12db4c..e2f3ecec49 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -266,7 +266,7 @@ impl VirtioDevice for Device { // the guest. 
This is the modern USO feature (bit 56); the legacy // HOST_UFO (bit 14) is not offered because it is deprecated in modern // Linux kernels and causes connectivity issues. - let host_uso = offloads.ufo && offloads.udp; + let host_uso = offloads.uso && offloads.udp; let features_bank0 = NetworkFeaturesBank0::new() .with_mac(true) diff --git a/vm/devices/virtio/virtio_net/src/tests.rs b/vm/devices/virtio/virtio_net/src/tests.rs index ea7a043fb0..9f38735b77 100644 --- a/vm/devices/virtio/virtio_net/src/tests.rs +++ b/vm/devices/virtio/virtio_net/src/tests.rs @@ -1733,7 +1733,7 @@ async fn feature_negotiation_with_offloads(driver: DefaultDriver) { tcp: true, udp: true, tso: true, - ufo: true, + uso: true, }, }; From 0bc4f6df3c4f23ff6e91d2be540b9bf5f848bc6d Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Sat, 21 Mar 2026 21:22:32 -0700 Subject: [PATCH 05/28] minor cleanup --- .../net/net_consomme/consomme/src/udp.rs | 20 +++---- .../net/net_consomme/consomme/src/unix.rs | 55 +++++++++---------- .../net/net_consomme/consomme/src/windows.rs | 10 ++-- 3 files changed, 39 insertions(+), 46 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/udp.rs b/vm/devices/net/net_consomme/consomme/src/udp.rs index 82243fe693..e1a1be71df 100644 --- a/vm/devices/net/net_consomme/consomme/src/udp.rs +++ b/vm/devices/net/net_consomme/consomme/src/udp.rs @@ -95,9 +95,7 @@ struct UdpConnection { recycle: bool, #[inspect(debug)] last_activity: Instant, - /// The UDP GSO segment size currently configured on the socket (0 = disabled). - /// Tracked to avoid redundant setsockopt calls on every packet. 
- gso_size: u16, + gso_size: Option, } #[derive(Inspect, Default)] @@ -308,16 +306,12 @@ impl Access<'_, T> { let conn = self.get_or_insert(guest_addr, Some(frame.src_addr))?; let socket = conn.socket.as_ref().unwrap().get(); - let new_gso = checksum.gso.unwrap_or(0); - if conn.gso_size != new_gso { - platform::set_udp_gso_size(socket, new_gso).map_err(DropReason::Io)?; - conn.gso_size = new_gso; + if conn.gso_size != checksum.gso { + platform::set_udp_gso_size(socket, checksum.gso.unwrap_or(0)) + .map_err(DropReason::Io)?; + conn.gso_size = checksum.gso; } - let result = if let Some(seg_size) = checksum.gso { - platform::send_udp_with_gso(socket, udp_packet.payload(), &dst_sock_addr, seg_size) - } else { - socket.send_to(udp_packet.payload(), dst_sock_addr) - }; + let result = platform::send_to(socket, udp_packet.payload(), &dst_sock_addr, checksum.gso); match result { Ok(_) => { conn.stats.tx_packets.increment(); @@ -362,7 +356,7 @@ impl Access<'_, T> { stats: Default::default(), recycle: false, last_activity: Instant::now(), - gso_size: 0, + gso_size: None, }; Ok(e.insert(conn)) } diff --git a/vm/devices/net/net_consomme/consomme/src/unix.rs b/vm/devices/net/net_consomme/consomme/src/unix.rs index f95596e2f4..799df33a48 100644 --- a/vm/devices/net/net_consomme/consomme/src/unix.rs +++ b/vm/devices/net/net_consomme/consomme/src/unix.rs @@ -5,11 +5,10 @@ //! Unix platform helpers for consomme. //! //! - IPv6 address detection via `getifaddrs()`. -//! - UDP GSO batch send: +//! - UDP GSO send: //! - Linux: `setsockopt(IPPROTO_UDP, UDP_SEGMENT)` sets the segment size -//! once per connection; subsequent `send_to()` calls are segmented by the -//! kernel automatically. -//! - macOS: `sendmsg_x()` private API (user-space segments, one syscall). +//! once per connection +//! - macOS: `sendmsg_x()` private API. // UNSAFETY: getifaddrs/freeifaddrs; setsockopt for UDP_SEGMENT (Linux); // sendmsg_x (private Apple API) with a manually built msghdr_x array. 
@@ -18,13 +17,13 @@ use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::UdpSocket; +use std::os::unix::io::AsRawFd; /// Configure the UDP GSO segment size on `socket`. /// /// On Linux this calls `setsockopt(IPPROTO_UDP, UDP_SEGMENT, size)`, which /// persists for the lifetime of the connection. When `size` is 0 the option -/// is cleared and normal (non-GSO) sends resume. On macOS `sendmsg_x` conveys -/// the segment size per-call, so this is a no-op. +/// is cleared and normal (non-GSO) sends resume. #[cfg(target_os = "linux")] pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { use std::mem::size_of; @@ -56,43 +55,45 @@ pub fn set_udp_gso_size(_socket: &UdpSocket, _size: u16) -> std::io::Result<()> Ok(()) } -/// Send `data` as a UDP GSO batch. +/// Send `data` to `dst` via `socket`, using UDP GSO if `gso` is `Some`. /// /// On Linux, `UDP_SEGMENT` must already be configured on the socket via /// [`set_udp_gso_size`]. The kernel then automatically splits the outgoing -/// buffer into datagrams of that size. The `seg_size` parameter is accepted -/// for API uniformity with the macOS implementation but is not used here. +/// buffer into datagrams of that size, so this is just a plain `send_to` +/// regardless of the `gso` value. #[cfg(target_os = "linux")] -pub fn send_udp_with_gso( +pub fn send_to( socket: &UdpSocket, data: &[u8], dst: &SocketAddr, - _seg_size: u16, + _gso: Option, ) -> std::io::Result { socket.send_to(data, *dst) } -/// macOS batch send using the private `sendmsg_x()` API. +/// Send `data` to `dst` via `socket`, using UDP GSO if `gso` is `Some`. /// -/// `sendmsg_x()` and `msghdr_x` are undocumented Apple extensions (present -/// since macOS 10.11) that allow sending multiple datagrams in a single -/// syscall. `msghdr_x` is identical to the standard `msghdr` except for an -/// extra `msg_datalen` field that records the byte count for each entry. 
-/// `sendmsg_x` returns the number of messages queued, not bytes. +/// When `gso` is `None`, this is a plain `send_to`. When `gso` is +/// `Some(seg_size)`, macOS uses the private `sendmsg_x()` API to batch +/// multiple datagrams in a single syscall (user-space segmentation). /// -/// This gives us user-space segmentation with a single syscall, rather than -/// one syscall per segment. +/// `sendmsg_x()` and `msghdr_x` are undocumented Apple extensions (present +/// since macOS 10.11). `msghdr_x` is identical to the standard `msghdr` +/// except for an extra `msg_datalen` field that records the byte count for +/// each entry. `sendmsg_x` returns the number of messages queued, not bytes. #[cfg(target_os = "macos")] -pub fn send_udp_with_gso( +pub fn send_to( socket: &UdpSocket, data: &[u8], dst: &SocketAddr, - seg_size: u16, + gso: Option, ) -> std::io::Result { - use std::os::unix::io::AsRawFd; + let Some(seg_size) = gso else { + return socket.send_to(data, *dst); + }; - // Private Apple extension of msghdr: identical layout up to msg_flags, - // then an extra msg_datalen field that holds the per-entry byte count. + // Private Apple extension of msghdr + // Adapted from: https://github.com/apple-oss-distributions/xnu/blob/8d741a5de7ff4191bf97d57b9f54c2f6d4a15585/bsd/sys/socket_private.h #[repr(C)] struct MsghdrX { msg_name: *mut libc::c_void, @@ -119,8 +120,7 @@ pub fn send_udp_with_gso( let sockaddr = socket2::SockAddr::from(*dst); let seg_size = seg_size as usize; - // Build one iovec per segment. Collected up front so the Vec's heap - // allocation is stable before we take raw pointers into it. + // Build one iovec per segment. let iovecs: Vec = data .chunks(seg_size) .map(|chunk| libc::iovec { @@ -129,8 +129,7 @@ pub fn send_udp_with_gso( }) .collect(); - // Build a matching msghdr_x per segment. Each entry shares the same - // destination address and points to its own iovec. + // Build a matching msghdr_x per segment. 
let hdrs: Vec = iovecs .iter() .map(|iov| MsghdrX { diff --git a/vm/devices/net/net_consomme/consomme/src/windows.rs b/vm/devices/net/net_consomme/consomme/src/windows.rs index 2acb826547..0717e09eb3 100644 --- a/vm/devices/net/net_consomme/consomme/src/windows.rs +++ b/vm/devices/net/net_consomme/consomme/src/windows.rs @@ -133,18 +133,18 @@ pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { Ok(()) } -/// Send `data` as a UDP GSO batch. +/// Send `data` to `dst` via `socket`, using UDP GSO if `gso` is `Some`. /// /// The `UDP_SEND_MSG_SIZE` socket option must already be set to the desired /// segment size via [`set_udp_gso_size`] before calling this function. The /// Windows networking stack then automatically splits each outgoing send into -/// datagrams of that size. The `seg_size` parameter is accepted for API -/// uniformity with the Unix implementation but is not used here. -pub fn send_udp_with_gso( +/// datagrams of that size, so this is just a plain `send_to` regardless of +/// the `gso` value. +pub fn send_to( socket: &UdpSocket, data: &[u8], dst: &SocketAddr, - _seg_size: u16, + _gso: Option, ) -> std::io::Result { socket.send_to(data, *dst) } From 2cb5959088f8fe0e2e78ef0ec371504de641c11c Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Sat, 21 Mar 2026 22:02:55 -0700 Subject: [PATCH 06/28] . --- .../net/net_consomme/consomme/src/windows.rs | 8 +--- vm/devices/virtio/virtio_net/src/lib.rs | 37 ++++++------------- 2 files changed, 12 insertions(+), 33 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/windows.rs b/vm/devices/net/net_consomme/consomme/src/windows.rs index 0717e09eb3..231b7f9f22 100644 --- a/vm/devices/net/net_consomme/consomme/src/windows.rs +++ b/vm/devices/net/net_consomme/consomme/src/windows.rs @@ -133,13 +133,7 @@ pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { Ok(()) } -/// Send `data` to `dst` via `socket`, using UDP GSO if `gso` is `Some`. 
-/// -/// The `UDP_SEND_MSG_SIZE` socket option must already be set to the desired -/// segment size via [`set_udp_gso_size`] before calling this function. The -/// Windows networking stack then automatically splits each outgoing send into -/// datagrams of that size, so this is just a plain `send_to` regardless of -/// the `gso` value. +/// Send `data` to `dst` via `socket`. pub fn send_to( socket: &UdpSocket, data: &[u8], diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index 8acb8172f5..5af7249ddd 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -263,7 +263,7 @@ impl VirtioDevice for Device { // VIRTIO_NET_F_HOST_USO (bank 1): we can handle UDP segmentation from // the guest. This is the modern USO feature (bit 56); the legacy // HOST_UFO (bit 14) is not offered because it is deprecated in modern - // Linux kernels and causes connectivity issues. + // Linux kernels. let host_uso = offloads.uso && offloads.udp; let features_bank0 = NetworkFeaturesBank0::new() @@ -1077,16 +1077,19 @@ impl Worker { let mut max_tcp_segment_size: u16 = 0; let mut max_udp_segment_size: u16 = 0; - // Determine IP version from GSO type when available. - let is_ipv4_from_gso = gso_protocol == VirtioNetHeaderGsoProtocol::TCPV4; - let is_ipv6_from_gso = gso_protocol == VirtioNetHeaderGsoProtocol::TCPV6; - // Parse the Ethernet header to determine IP version and L2 length. - // EtherType is at offset 12. If it's 0x8100 (VLAN), the real - // EtherType is at offset 16 and L2 is 18 bytes. let (parsed_l2_len, is_ipv4_from_eth, is_ipv6_from_eth) = Self::parse_ethertype(packet_prefix); + // Resolve IP version. TCP GSO types (TCPV4/TCPV6) encode the IP + // version directly. For everything else (UDP_L4, NONE), fall back + // to the EtherType parsed from the Ethernet header. 
+ let (is_ipv4, is_ipv6) = match gso_protocol { + VirtioNetHeaderGsoProtocol::TCPV4 => (true, false), + VirtioNetHeaderGsoProtocol::TCPV6 => (false, true), + _ => (is_ipv4_from_eth, is_ipv6_from_eth), + }; + // Only honor NEEDS_CSUM if VIRTIO_NET_F_CSUM was negotiated. if flags_byte.needs_csum() && features.csum() { // The guest requests partial checksum offload. @@ -1117,10 +1120,6 @@ impl Worker { flags.set_offload_udp_checksum(true); } - // Prefer GSO-derived IP version, then EtherType-derived. - let is_ipv4 = is_ipv4_from_gso || (!is_ipv6_from_gso && is_ipv4_from_eth); - let is_ipv6 = is_ipv6_from_gso || (!is_ipv4_from_gso && is_ipv6_from_eth); - // Only enable checksum offloads if we know the IP version; // backends require consistent is_ipv4/is_ipv6 and header lengths. if !is_ipv4 && !is_ipv6 { @@ -1146,7 +1145,6 @@ impl Worker { && match gso_protocol { VirtioNetHeaderGsoProtocol::TCPV4 => features.host_tso4(), VirtioNetHeaderGsoProtocol::TCPV6 => features.host_tso6(), - VirtioNetHeaderGsoProtocol::UDP => features.host_ufo(), VirtioNetHeaderGsoProtocol::UDP_L4 => features_bank1.host_uso(), _ => false, }; @@ -1162,8 +1160,7 @@ impl Worker { l3_len = header.csum_start - l2_len as u16; } - let is_udp = gso_protocol == VirtioNetHeaderGsoProtocol::UDP - || gso_protocol == VirtioNetHeaderGsoProtocol::UDP_L4; + let is_udp = gso_protocol == VirtioNetHeaderGsoProtocol::UDP_L4; // Derive l4_len from hdr_len if available: // hdr_len = l2_len + l3_len + l4_len (total header length) @@ -1195,18 +1192,6 @@ impl Worker { max_tcp_segment_size = header.gso_size; } - // UDP GSO has no separate TCPV4/TCPV6 variants, - // so derive IP version from EtherType; TCP uses GSO type. 
- let is_ipv4 = if is_udp { - is_ipv4_from_eth - } else { - is_ipv4_from_gso - }; - let is_ipv6 = if is_udp { - is_ipv6_from_eth - } else { - is_ipv6_from_gso - }; flags.set_is_ipv4(is_ipv4); flags.set_is_ipv6(is_ipv6); if is_ipv4 { From 541f3fe47f76680ae623a566357b10e42e4d2754 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Sat, 21 Mar 2026 22:04:22 -0700 Subject: [PATCH 07/28] . --- .../net/net_consomme/consomme/src/unix.rs | 6 ++--- .../net/net_consomme/consomme/src/windows.rs | 23 ++++--------------- 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/unix.rs b/vm/devices/net/net_consomme/consomme/src/unix.rs index 799df33a48..87522efc51 100644 --- a/vm/devices/net/net_consomme/consomme/src/unix.rs +++ b/vm/devices/net/net_consomme/consomme/src/unix.rs @@ -14,6 +14,7 @@ // sendmsg_x (private Apple API) with a manually built msghdr_x array. #![expect(unsafe_code)] +use std::mem::size_of; use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::UdpSocket; @@ -26,9 +27,6 @@ use std::os::unix::io::AsRawFd; /// is cleared and normal (non-GSO) sends resume. #[cfg(target_os = "linux")] pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { - use std::mem::size_of; - use std::os::unix::io::AsRawFd; - // SAFETY: setsockopt with a valid u16 optval per Linux udp(7) documentation // for UDP_SEGMENT. 
let ret = unsafe { @@ -66,7 +64,7 @@ pub fn send_to( socket: &UdpSocket, data: &[u8], dst: &SocketAddr, - _gso: Option, + _: Option, ) -> std::io::Result { socket.send_to(data, *dst) } diff --git a/vm/devices/net/net_consomme/consomme/src/windows.rs b/vm/devices/net/net_consomme/consomme/src/windows.rs index 0717e09eb3..86563e09af 100644 --- a/vm/devices/net/net_consomme/consomme/src/windows.rs +++ b/vm/devices/net/net_consomme/consomme/src/windows.rs @@ -17,6 +17,7 @@ use windows_sys::Win32::Foundation::ERROR_SUCCESS; use windows_sys::Win32::NetworkManagement::IpHelper::MIB_UNICASTIPADDRESS_TABLE; use windows_sys::Win32::Networking::WinSock; use windows_sys::Win32::Networking::WinSock::AF_INET6; +use windows_sys::Win32::Networking::WinSock::UDP_SEND_MSG_SIZE; pub fn disable_connection_retries(sock: &Socket) -> Result<(), i32> { const TCP_INITIAL_RTO_UNSPECIFIED_RTT: u16 = 0xffff; @@ -99,17 +100,7 @@ pub fn host_has_ipv6_address() -> Result { Ok(has_ipv6) } -// UDP_SEND_MSG_SIZE = 2 (ws2ipdef.h, IPPROTO_UDP level). -const UDP_SEND_MSG_SIZE: i32 = 2; - /// Configure the `UDP_SEND_MSG_SIZE` socket option on `socket`. -/// -/// When `size` is non-zero the Windows networking stack automatically splits -/// each outgoing send buffer into UDP datagrams of that many bytes. Setting -/// it to 0 disables segmentation and restores normal send behaviour. -/// -/// This is called once when the GSO segment size changes, not on every send, -/// so the option persists for the lifetime of the connection. 
pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { let raw = socket.as_raw_socket() as WinSock::SOCKET; let size_dword = size as u32; @@ -118,7 +109,7 @@ pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { let ret = unsafe { WinSock::setsockopt( raw, - WinSock::IPPROTO_UDP as i32, + WinSock::IPPROTO_UDP, UDP_SEND_MSG_SIZE, std::ptr::from_ref(&size_dword).cast::(), size_of::() as i32, @@ -133,18 +124,12 @@ pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { Ok(()) } -/// Send `data` to `dst` via `socket`, using UDP GSO if `gso` is `Some`. -/// -/// The `UDP_SEND_MSG_SIZE` socket option must already be set to the desired -/// segment size via [`set_udp_gso_size`] before calling this function. The -/// Windows networking stack then automatically splits each outgoing send into -/// datagrams of that size, so this is just a plain `send_to` regardless of -/// the `gso` value. +/// Send `data` to `dst` via `socket` pub fn send_to( socket: &UdpSocket, data: &[u8], dst: &SocketAddr, - _gso: Option, + _: Option, ) -> std::io::Result { socket.send_to(data, *dst) } From 648d1565b274cd3a6c5199a322d4fc7ae159b287 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Tue, 24 Mar 2026 21:14:07 -0700 Subject: [PATCH 08/28] copilot feedback --- vm/devices/net/net_backend/src/lib.rs | 2 +- vm/devices/net/net_consomme/consomme/src/unix.rs | 4 ++-- vm/devices/net/net_consomme/consomme/src/windows.rs | 2 +- vm/devices/virtio/virtio_net/src/tests.rs | 6 +----- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/vm/devices/net/net_backend/src/lib.rs b/vm/devices/net/net_backend/src/lib.rs index 5fc4ec57ae..8f22699c06 100644 --- a/vm/devices/net/net_backend/src/lib.rs +++ b/vm/devices/net/net_backend/src/lib.rs @@ -361,7 +361,7 @@ pub struct TxFlags { pub is_ipv4: bool, /// If true, the packet is IPv6. Mutually exclusive with `is_ipv4`. 
pub is_ipv6: bool, - /// Offload UDP segmentation (UFO), allowing UDP packets larger than the + /// Offload UDP segmentation (USO), allowing UDP packets larger than the /// MTU. `l2_len`, `l3_len`, and `max_udp_segment_size` must be set. pub offload_udp_segmentation: bool, #[bits(1)] diff --git a/vm/devices/net/net_consomme/consomme/src/unix.rs b/vm/devices/net/net_consomme/consomme/src/unix.rs index 87522efc51..4e39c02655 100644 --- a/vm/devices/net/net_consomme/consomme/src/unix.rs +++ b/vm/devices/net/net_consomme/consomme/src/unix.rs @@ -25,7 +25,7 @@ use std::os::unix::io::AsRawFd; /// On Linux this calls `setsockopt(IPPROTO_UDP, UDP_SEGMENT, size)`, which /// persists for the lifetime of the connection. When `size` is 0 the option /// is cleared and normal (non-GSO) sends resume. -#[cfg(target_os = "linux")] +#[cfg(not(target_os = "macos"))] pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { // SAFETY: setsockopt with a valid u16 optval per Linux udp(7) documentation // for UDP_SEGMENT. @@ -59,7 +59,7 @@ pub fn set_udp_gso_size(_socket: &UdpSocket, _size: u16) -> std::io::Result<()> /// [`set_udp_gso_size`]. The kernel then automatically splits the outgoing /// buffer into datagrams of that size, so this is just a plain `send_to` /// regardless of the `gso` value. 
-#[cfg(target_os = "linux")] +#[cfg(not(target_os = "macos"))] pub fn send_to( socket: &UdpSocket, data: &[u8], diff --git a/vm/devices/net/net_consomme/consomme/src/windows.rs b/vm/devices/net/net_consomme/consomme/src/windows.rs index c4ada18472..f6e184f157 100644 --- a/vm/devices/net/net_consomme/consomme/src/windows.rs +++ b/vm/devices/net/net_consomme/consomme/src/windows.rs @@ -111,7 +111,7 @@ pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { raw, WinSock::IPPROTO_UDP, UDP_SEND_MSG_SIZE, - std::ptr::from_ref(&size_dword).cast::(), + std::ptr::from_ref(&size_dword).cast::(), size_of::() as i32, ) }; diff --git a/vm/devices/virtio/virtio_net/src/tests.rs b/vm/devices/virtio/virtio_net/src/tests.rs index e50b09ce6b..acaeac4787 100644 --- a/vm/devices/virtio/virtio_net/src/tests.rs +++ b/vm/devices/virtio/virtio_net/src/tests.rs @@ -1758,15 +1758,11 @@ async fn feature_negotiation_with_offloads(driver: DefaultDriver) { bank0.host_tso6(), "HOST_TSO6 should be set when tso+tcp supported" ); - assert!( - !bank0.host_ufo(), - "HOST_UFO (legacy) should not be set — use HOST_USO instead" - ); let bank1 = NetworkFeaturesBank1::from(traits.device_features.bank(1)); assert!( bank1.host_uso(), - "HOST_USO should be set when ufo+udp offloads supported" + "HOST_USO should be set when uso+udp offloads supported" ); } From d2378dccd441759434e6296f0180e016bcb1d38e Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Tue, 24 Mar 2026 21:16:24 -0700 Subject: [PATCH 09/28] . --- vm/devices/net/net_mana/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/vm/devices/net/net_mana/src/lib.rs b/vm/devices/net/net_mana/src/lib.rs index 7e0cae4903..88fe2e81af 100644 --- a/vm/devices/net/net_mana/src/lib.rs +++ b/vm/devices/net/net_mana/src/lib.rs @@ -525,6 +525,7 @@ impl Endpoint for ManaEndpoint { udp: true, // Tbe bounce buffer path does not support TSO. 
tso: !self.bounce_buffer, + uso: false, } } From 18730aa034d1dfb4fbbf4bfe8f347c2220498e8f Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Tue, 24 Mar 2026 21:31:37 -0700 Subject: [PATCH 10/28] . --- vm/devices/net/net_tap/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index e6916cb239..be26a46c61 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -185,6 +185,7 @@ impl Endpoint for TapEndpoint { tcp: true, udp: true, tso: true, + uso: false, } } } From b413c3e4401b9874386d0ac6daade82d187e9121 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Wed, 25 Mar 2026 21:15:44 -0700 Subject: [PATCH 11/28] fmt --- vm/devices/net/net_tap/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index be26a46c61..d52c17a9d2 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -185,7 +185,7 @@ impl Endpoint for TapEndpoint { tcp: true, udp: true, tso: true, - uso: false, + uso: false, } } } From 21d1fba056510599569932944e3b692af6e71a4f Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Wed, 25 Mar 2026 21:18:33 -0700 Subject: [PATCH 12/28] compile fix --- vm/devices/net/net_consomme/consomme/src/windows.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm/devices/net/net_consomme/consomme/src/windows.rs b/vm/devices/net/net_consomme/consomme/src/windows.rs index f6e184f157..c4ada18472 100644 --- a/vm/devices/net/net_consomme/consomme/src/windows.rs +++ b/vm/devices/net/net_consomme/consomme/src/windows.rs @@ -111,7 +111,7 @@ pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { raw, WinSock::IPPROTO_UDP, UDP_SEND_MSG_SIZE, - std::ptr::from_ref(&size_dword).cast::(), + std::ptr::from_ref(&size_dword).cast::(), size_of::() as i32, ) }; From ab4d128571a1068eb729c3482fc8742b9e0b3557 Mon Sep 17 
00:00:00 2001 From: Daman Mulye Date: Thu, 9 Apr 2026 09:43:45 -0700 Subject: [PATCH 13/28] feedback --- vm/devices/net/gdma/src/bnic.rs | 5 ++--- vm/devices/net/net_backend/src/lib.rs | 15 ++++++--------- vm/devices/net/net_consomme/consomme/src/lib.rs | 5 ++++- vm/devices/net/net_consomme/src/lib.rs | 5 +++-- vm/devices/net/net_mana/src/lib.rs | 2 +- vm/devices/net/net_mana/src/test.rs | 2 +- vm/devices/net/net_tap/src/lib.rs | 6 +++--- vm/devices/net/netvsp/src/lib.rs | 2 +- vm/devices/net/netvsp/src/test.rs | 1 + vm/devices/virtio/virtio_net/src/lib.rs | 10 ++++------ vm/devices/virtio/virtio_net/src/tests.rs | 12 ++++++------ 11 files changed, 32 insertions(+), 33 deletions(-) diff --git a/vm/devices/net/gdma/src/bnic.rs b/vm/devices/net/gdma/src/bnic.rs index 0fe74b8004..d34f4c9e5d 100644 --- a/vm/devices/net/gdma/src/bnic.rs +++ b/vm/devices/net/gdma/src/bnic.rs @@ -544,15 +544,14 @@ impl TxRxTask { l2_len: 14, l3_len: oob.s_oob.trans_off().clamp(14, 255) - 14, l4_len: 0, - max_tcp_segment_size: 0, - max_udp_segment_size: 0, + max_segment_size: 0, }; if sqe.header.params.client_oob_in_sgl() { meta.l4_len = sge0.size .saturating_sub(meta.l2_len as u32 + meta.l3_len as u32) as u8; - meta.max_tcp_segment_size = sqe.header.params.gd_client_unit_data(); + meta.max_segment_size = sqe.header.params.gd_client_unit_data(); meta.flags.set_offload_tcp_segmentation(true); } diff --git a/vm/devices/net/net_backend/src/lib.rs b/vm/devices/net/net_backend/src/lib.rs index 8f22699c06..8d52339871 100644 --- a/vm/devices/net/net_backend/src/lib.rs +++ b/vm/devices/net/net_backend/src/lib.rs @@ -328,12 +328,10 @@ pub struct TxMetadata { /// The length of the TCP header. Only guaranteed to be set if various /// offload flags are set. pub l4_len: u8, - /// The maximum TCP segment size, used for segmentation. Only guaranteed to - /// be set if [`TxFlags::offload_tcp_segmentation`] is set. 
- pub max_tcp_segment_size: u16, - /// The maximum UDP segment size, used for UDP segmentation offload. Only - /// guaranteed to be set if [`TxFlags::offload_udp_segmentation`] is set. - pub max_udp_segment_size: u16, + /// The maximum segment size, used for segmentation offload (TSO or USO). + /// Only guaranteed to be set if [`TxFlags::offload_tcp_segmentation`] or + /// [`TxFlags::offload_udp_segmentation`] is set. + pub max_segment_size: u16, } /// Flags affecting transmit behavior. @@ -362,7 +360,7 @@ pub struct TxFlags { /// If true, the packet is IPv6. Mutually exclusive with `is_ipv4`. pub is_ipv6: bool, /// Offload UDP segmentation (USO), allowing UDP packets larger than the - /// MTU. `l2_len`, `l3_len`, and `max_udp_segment_size` must be set. + /// MTU. `l2_len`, `l3_len`, and `max_segment_size` must be set. pub offload_udp_segmentation: bool, #[bits(1)] _reserved: u8, @@ -378,8 +376,7 @@ impl Default for TxMetadata { l2_len: 0, l3_len: 0, l4_len: 0, - max_tcp_segment_size: 0, - max_udp_segment_size: 0, + max_segment_size: 0, } } } diff --git a/vm/devices/net/net_consomme/consomme/src/lib.rs b/vm/devices/net/net_consomme/consomme/src/lib.rs index 162aebe6a5..938996247f 100644 --- a/vm/devices/net/net_consomme/consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/consomme/src/lib.rs @@ -399,6 +399,9 @@ pub enum DropReason { /// E.g. a TCP packet with both SYN and FIN flags set. #[error("packet is malformed")] MalformedPacket, + /// The IP total-length field does not match the buffer size. + #[error("ip length mismatch")] + IpLengthMismatch, /// An incoming IP packet has been split into several IP fragments and was dropped, /// since IP reassembly is not supported. #[error("packet fragmentation is not supported")] @@ -610,7 +613,7 @@ impl Access<'_, T> { // instead. 
let segmentation_offload = checksum.tso.is_some() || checksum.gso.is_some(); if !segmentation_offload && payload.len() < ipv4.total_len().into() { - return Err(DropReason::MalformedPacket); + return Err(DropReason::IpLengthMismatch); } let total_len = if segmentation_offload { diff --git a/vm/devices/net/net_consomme/src/lib.rs b/vm/devices/net/net_consomme/src/lib.rs index c95f15727d..d6057f69a3 100644 --- a/vm/devices/net/net_consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/src/lib.rs @@ -332,11 +332,11 @@ impl net_backend::Queue for ConsommeQueue { tso: meta .flags .offload_tcp_segmentation() - .then_some(meta.max_tcp_segment_size), + .then_some(meta.max_segment_size), gso: meta .flags .offload_udp_segmentation() - .then_some(meta.max_udp_segment_size), + .then_some(meta.max_segment_size), }; let mut buf = vec![0; meta.len as usize]; @@ -368,6 +368,7 @@ impl net_backend::Queue for ConsommeQueue { | consomme::DropReason::Io(_) | consomme::DropReason::BadTcpState(_) | consomme::DropReason::FragmentedPacket + | consomme::DropReason::IpLengthMismatch | consomme::DropReason::MalformedPacket => self.stats.tx_errors.increment(), consomme::DropReason::PortNotBound => unreachable!(), } diff --git a/vm/devices/net/net_mana/src/lib.rs b/vm/devices/net/net_mana/src/lib.rs index 88fe2e81af..450b4ba7ef 100644 --- a/vm/devices/net/net_mana/src/lib.rs +++ b/vm/devices/net/net_mana/src/lib.rs @@ -1191,7 +1191,7 @@ impl ManaQueue { return Ok(None); } builder.set_client_oob_in_sgl(header_len as u8); - builder.set_gd_client_unit_data(meta.max_tcp_segment_size); + builder.set_gd_client_unit_data(meta.max_segment_size); let (head_iova, used_segments, used_segments_len) = if header_len > head.len || self.force_tx_header_bounce { diff --git a/vm/devices/net/net_mana/src/test.rs b/vm/devices/net/net_mana/src/test.rs index 12ca34ba9c..a02660264c 100644 --- a/vm/devices/net/net_mana/src/test.rs +++ b/vm/devices/net/net_mana/src/test.rs @@ -665,7 +665,7 @@ fn build_tx_segments( 
l2_len: 14, // Ethernet header l3_len: 20, // IPv4 header l4_len: 20, // TCP header - max_tcp_segment_size: 1460, // Typical MSS for Ethernet + max_segment_size: 1460, // Typical MSS for Ethernet ..Default::default() }; diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index d52c17a9d2..2dfb552c9f 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -185,7 +185,7 @@ impl Endpoint for TapEndpoint { tcp: true, udp: true, tso: true, - uso: false, + uso: true, } } } @@ -435,7 +435,7 @@ fn build_vnet_hdr(meta: &TxMetadata) -> VirtioNetHdr { flags: VirtioNetHdrFlags::new().with_needs_csum(true), gso_type: VirtioNetHdrGso::new().with_protocol(protocol), hdr_len: meta.l2_len as u16 + meta.l3_len + meta.l4_len as u16, - gso_size: meta.max_tcp_segment_size, + gso_size: meta.max_segment_size, csum_start: meta.l2_len as u16 + meta.l3_len, csum_offset: 16, // TCP checksum field offset num_buffers: 0, @@ -531,7 +531,7 @@ mod tests { l2_len: 14, l3_len: 20, l4_len: 32, - max_tcp_segment_size: 1460, + max_segment_size: 1460, ..Default::default() }; let hdr = build_vnet_hdr(&meta); diff --git a/vm/devices/net/netvsp/src/lib.rs b/vm/devices/net/netvsp/src/lib.rs index 0782f12811..8fc5416122 100644 --- a/vm/devices/net/netvsp/src/lib.rs +++ b/vm/devices/net/netvsp/src/lib.rs @@ -2576,7 +2576,7 @@ impl NetChannel { reader.read(std::slice::from_mut(&mut b))?; (b >> 4) * 4 }; - metadata.max_tcp_segment_size = n.mss() as u16; + metadata.max_segment_size = n.mss() as u16; if request.data_length >= rndisprot::LSO_MAX_OFFLOAD_SIZE { // Not strictly enforced. 
diff --git a/vm/devices/net/netvsp/src/test.rs b/vm/devices/net/netvsp/src/test.rs index 6f5058c9e1..9b2fd8a141 100644 --- a/vm/devices/net/netvsp/src/test.rs +++ b/vm/devices/net/netvsp/src/test.rs @@ -200,6 +200,7 @@ impl TestNicEndpoint { tcp: true, udp: true, tso: true, + uso: false, }; let multiqueue_support = MultiQueueSupport { max_queues: u16::MAX, diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index 5af7249ddd..01dcefadea 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -1074,8 +1074,7 @@ impl Worker { let mut l2_len: u8 = 0; let mut l3_len: u16 = 0; let mut l4_len: u8 = 0; - let mut max_tcp_segment_size: u16 = 0; - let mut max_udp_segment_size: u16 = 0; + let mut max_segment_size: u16 = 0; // Parse the Ethernet header to determine IP version and L2 length. let (parsed_l2_len, is_ipv4_from_eth, is_ipv6_from_eth) = @@ -1184,12 +1183,12 @@ impl Worker { // the backend to fill it in. 
flags.set_offload_udp_checksum(true); flags.set_offload_tcp_checksum(false); - max_udp_segment_size = header.gso_size; + max_segment_size = header.gso_size; } else { flags.set_offload_tcp_segmentation(true); flags.set_offload_tcp_checksum(true); flags.set_offload_udp_checksum(false); - max_tcp_segment_size = header.gso_size; + max_segment_size = header.gso_size; } flags.set_is_ipv4(is_ipv4); @@ -1206,8 +1205,7 @@ impl Worker { l2_len, l3_len, l4_len, - max_tcp_segment_size, - max_udp_segment_size, + max_segment_size, ..Default::default() } } diff --git a/vm/devices/virtio/virtio_net/src/tests.rs b/vm/devices/virtio/virtio_net/src/tests.rs index acaeac4787..be1666dad6 100644 --- a/vm/devices/virtio/virtio_net/src/tests.rs +++ b/vm/devices/virtio/virtio_net/src/tests.rs @@ -1189,7 +1189,7 @@ fn tx_offload_no_offloads() { ); assert_eq!(meta.l2_len, 0); assert_eq!(meta.l3_len, 0); - assert_eq!(meta.max_tcp_segment_size, 0); + assert_eq!(meta.max_segment_size, 0); } /// No header at all → default TxMetadata. @@ -1318,7 +1318,7 @@ fn tx_offload_tso4() { assert_eq!(meta.l2_len, 14); assert_eq!(meta.l3_len, 20); assert_eq!(meta.l4_len, 32); // 66 - 14 - 20 = 32 - assert_eq!(meta.max_tcp_segment_size, 1460); + assert_eq!(meta.max_segment_size, 1460); } /// TSO6: gso_type=TCPV6 with needs_csum. @@ -1351,7 +1351,7 @@ fn tx_offload_tso6() { assert_eq!(meta.l2_len, 14); assert_eq!(meta.l3_len, 40); assert_eq!(meta.l4_len, 20); // 74 - 14 - 40 = 20 - assert_eq!(meta.max_tcp_segment_size, 1440); + assert_eq!(meta.max_segment_size, 1440); } /// TSO without needs_csum: GSO fields should still be parsed. @@ -1374,7 +1374,7 @@ fn tx_offload_tso_without_needs_csum() { ); assert!(meta.flags.offload_tcp_segmentation()); assert!(meta.flags.is_ipv4()); - assert_eq!(meta.max_tcp_segment_size, 1460); + assert_eq!(meta.max_segment_size, 1460); } /// USO4: gso_type=UDP_L4 with IPv4 EtherType. 
@@ -1409,7 +1409,7 @@ fn tx_offload_uso4() { assert!(!meta.flags.is_ipv6()); assert_eq!(meta.l2_len, 14); assert_eq!(meta.l3_len, 20); - assert_eq!(meta.max_udp_segment_size, 1472); + assert_eq!(meta.max_segment_size, 1472); assert!(!meta.flags.offload_tcp_segmentation()); } @@ -1442,7 +1442,7 @@ fn tx_offload_uso6() { assert!(meta.flags.is_ipv6()); assert_eq!(meta.l2_len, 14); assert_eq!(meta.l3_len, 40); - assert_eq!(meta.max_udp_segment_size, 1452); + assert_eq!(meta.max_segment_size, 1452); assert!(!meta.flags.offload_tcp_segmentation()); } From f660f0d66f3db63b2f7be31d10321936d0128727 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Thu, 9 Apr 2026 14:36:51 -0700 Subject: [PATCH 14/28] bad merge + feedback --- vm/devices/net/net_tap/src/lib.rs | 37 +++++++++++++++++++++++-- vm/devices/virtio/virtio_net/src/lib.rs | 6 +--- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index e7ee377a4b..da9bcb69c6 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -79,6 +79,7 @@ mod vnet_hdr { TCPV4 = 1, UDP = 3, TCPV6 = 4, + UDP_L4 = 5, } } @@ -434,7 +435,8 @@ fn fixup_ipv4_header_checksum(packet: &mut [u8], l2_len: usize) { /// /// For TSO, `gso_type` is set based on the `is_ipv4`/`is_ipv6` flags, and /// `NEEDS_CSUM` is always set since the kernel requires the checksum to be -/// partially computed when performing segmentation. +/// partially computed when performing segmentation. For USO, +/// `gso_type` is set to `UDP_L4` and the UDP header length (8) is used. /// /// If no offload flags are set, an all-zero header is returned, which tells the /// TAP device that the packet requires no special handling. 
@@ -454,6 +456,16 @@ fn build_vnet_hdr(meta: &TxMetadata) -> VirtioNetHdr { csum_offset: 16, // TCP checksum field offset num_buffers: 0, } + } else if meta.flags.offload_udp_segmentation() { + VirtioNetHdr { + flags: VirtioNetHdrFlags::new().with_needs_csum(true), + gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::UDP_L4), + hdr_len: meta.l2_len as u16 + meta.l3_len + 8, // 8 = UDP header length + gso_size: meta.max_segment_size, + csum_start: meta.l2_len as u16 + meta.l3_len, + csum_offset: 6, // UDP checksum field offset + num_buffers: 0, + } } else if meta.flags.offload_tcp_checksum() { VirtioNetHdr { flags: VirtioNetHdrFlags::new().with_needs_csum(true), @@ -498,7 +510,7 @@ fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { let l4_protocol = match hdr.gso_type.protocol() { VirtioNetHdrGsoProtocol::TCPV4 | VirtioNetHdrGsoProtocol::TCPV6 => L4Protocol::Tcp, - VirtioNetHdrGsoProtocol::UDP => L4Protocol::Udp, + VirtioNetHdrGsoProtocol::UDP | VirtioNetHdrGsoProtocol::UDP_L4 => L4Protocol::Udp, _ => L4Protocol::Unknown, }; @@ -636,6 +648,27 @@ mod tests { assert_eq!(meta.l4_protocol, L4Protocol::Udp); } + #[test] + fn vnet_hdr_from_tx_metadata_uso() { + let meta = TxMetadata { + flags: TxFlags::new() + .with_offload_udp_segmentation(true) + .with_offload_udp_checksum(true) + .with_is_ipv4(true), + l2_len: 14, + l3_len: 20, + max_segment_size: 1472, + ..Default::default() + }; + let hdr = build_vnet_hdr(&meta); + assert_eq!(hdr.gso_type.protocol(), VirtioNetHdrGsoProtocol::UDP_L4); + assert_eq!(hdr.gso_size, 1472); + assert_eq!(hdr.hdr_len, 14 + 20 + 8); + assert!(hdr.flags.needs_csum()); + assert_eq!(hdr.csum_start, 14 + 20); + assert_eq!(hdr.csum_offset, 6); + } + #[test] fn ipv4_header_checksum_fixup() { // Ethernet (14) + IPv4 header (20) with zero checksum field. 
diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index 4f3e0d2124..ecd3c45e64 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -257,16 +257,12 @@ impl VirtioDevice for Device { // VIRTIO_NET_F_CSUM: we can handle partial checksum from the guest let csum = offloads.tcp && offloads.udp; // VIRTIO_NET_F_HOST_TSO4/6: we can handle TSO from the guest - // TSO4 also requires IPv4 header checksum support since the backend - // must compute per-segment IPv4 header checksums. - let host_tso4 = offloads.tso && offloads.tcp && offloads.ipv4_header; - let host_tso6 = offloads.tso && offloads.tcp; + let host_tso = offloads.tso && offloads.tcp; // VIRTIO_NET_F_HOST_USO (bank 1): we can handle UDP segmentation from // the guest. This is the modern USO feature (bit 56); the legacy // HOST_UFO (bit 14) is not offered because it is deprecated in modern // Linux kernels. let host_uso = offloads.uso && offloads.udp; - let host_tso = offloads.tso && offloads.tcp; let features_bank0 = NetworkFeaturesBank0::new() .with_mac(true) From fb0e6f1e9c7e143c011592f324e8f9a7ff853e0c Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Thu, 9 Apr 2026 14:58:25 -0700 Subject: [PATCH 15/28] . --- vm/devices/net/net_consomme/consomme/src/unix.rs | 9 +++++++++ vm/devices/net/net_mana/src/test.rs | 6 +++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/unix.rs b/vm/devices/net/net_consomme/consomme/src/unix.rs index 4e39c02655..01bf22ccd8 100644 --- a/vm/devices/net/net_consomme/consomme/src/unix.rs +++ b/vm/devices/net/net_consomme/consomme/src/unix.rs @@ -118,6 +118,15 @@ pub fn send_to( let sockaddr = socket2::SockAddr::from(*dst); let seg_size = seg_size as usize; + // Guard against guest-controlled seg_size of 0, which would panic in + // chunks(), and degenerate sizes that would produce excessive allocations. 
+ if seg_size == 0 { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "GSO segment size must be non-zero", + )); + } + // Build one iovec per segment. let iovecs: Vec = data .chunks(seg_size) diff --git a/vm/devices/net/net_mana/src/test.rs b/vm/devices/net/net_mana/src/test.rs index c7d85823d4..c8479d6baa 100644 --- a/vm/devices/net/net_mana/src/test.rs +++ b/vm/devices/net/net_mana/src/test.rs @@ -689,9 +689,9 @@ fn build_tx_segments( id: TxId(tx_id), segment_count: num_segments as u8, len: packet_len as u32, - l2_len: 14, // Ethernet header - l3_len: 20, // IPv4 header - l4_len: 20, // TCP header + l2_len: 14, // Ethernet header + l3_len: 20, // IPv4 header + l4_len: 20, // TCP header max_segment_size: 1460, // Typical MSS for Ethernet ..Default::default() }; From 3600e11ec8c9a9d5d8236a07122ab1ec76b97d10 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Thu, 9 Apr 2026 15:25:24 -0700 Subject: [PATCH 16/28] . --- Guide/src/reference/backends/networking.md | 2 +- vm/devices/net/net_tap/tests/tap_tests.rs | 2 +- vm/devices/virtio/virtio_net/src/lib.rs | 23 ++++++++++++++-------- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/Guide/src/reference/backends/networking.md b/Guide/src/reference/backends/networking.md index b07b5117d7..c3956db77f 100644 --- a/Guide/src/reference/backends/networking.md +++ b/Guide/src/reference/backends/networking.md @@ -62,7 +62,7 @@ for the full trait signatures and type definitions. 1. The guest posts a TX descriptor (e.g. a virtio descriptor chain or a VMBus RNDIS message). 2. The frontend reads the descriptor from guest memory, extracts any - offload metadata (checksum, TSO), and builds a `TxSegment` array. + offload metadata (checksum, TSO, USO), and builds a `TxSegment` array. Each segment carries a guest physical address and a length — **no data is copied** at this point. 3. The frontend calls `queue.tx_avail(&mut pool, &segments)`. 
The diff --git a/vm/devices/net/net_tap/tests/tap_tests.rs b/vm/devices/net/net_tap/tests/tap_tests.rs index 4866e259fa..7f275f0281 100644 --- a/vm/devices/net/net_tap/tests/tap_tests.rs +++ b/vm/devices/net/net_tap/tests/tap_tests.rs @@ -572,7 +572,7 @@ mod tap_tests { l2_len: 14, l3_len: 20, l4_len: 20, - max_tcp_segment_size: 1460, + max_segment_size: 1460, }), gpa: 0, len: frame_len, diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index ecd3c45e64..abb0706387 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -1131,9 +1131,13 @@ impl Worker { // GSO (segmentation offload) — only honor if the corresponding // HOST_TSO/HOST_USO feature was negotiated. Per the virtio spec, all - // GSO features require VIRTIO_NET_F_CSUM; guard against a misbehaving - // guest that negotiates GSO without CSUM. - let gso_enabled = features.csum() + // GSO features require VIRTIO_NET_F_CSUM and packets must set + // NEEDS_CSUM; guard against a misbehaving guest that sends GSO + // packets without CSUM negotiated or without the per-packet flag. + // Requiring NEEDS_CSUM ensures that the checksum-field validation + // above has run and produced validated l2_len/l3_len values. + let gso_enabled = flags_byte.needs_csum() + && features.csum() && match gso_protocol { VirtioNetHeaderGsoProtocol::TCPV4 => features.host_tso4(), VirtioNetHeaderGsoProtocol::TCPV6 => features.host_tso6(), @@ -1147,10 +1151,9 @@ impl Worker { // Validate gso_size and l2_len before enabling segmentation. if l2_len > 0 && header.gso_size > 0 { - // Derive l3_len from csum_start if we haven't already. - if l3_len == 0 && header.csum_start > l2_len as u16 { - l3_len = header.csum_start - l2_len as u16; - } + // l3_len was derived from the validated csum_start in the + // NEEDS_CSUM block above. Do not re-derive it here from + // unvalidated header fields. 
let is_udp = gso_protocol == VirtioNetHeaderGsoProtocol::UDP_L4; @@ -1167,7 +1170,11 @@ impl Worker { // For UDP GSO, hdr_len==0 is acceptable (fixed 8-byte header). // For TCP GSO, we need a valid l4_len to proceed. - let valid_lengths = l3_len > 0 && (l4_len > 0 || (is_udp && header.hdr_len == 0)); + // Also require a known IP version; backends need consistent + // is_ipv4/is_ipv6 flags for correct offload processing. + let valid_lengths = l3_len > 0 + && (is_ipv4 || is_ipv6) + && (l4_len > 0 || (is_udp && header.hdr_len == 0)); if valid_lengths { if is_udp { From b4f6c3a404401db5aa2182d037538ff6e5b48dd9 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Thu, 9 Apr 2026 16:08:53 -0700 Subject: [PATCH 17/28] . --- .../net/net_consomme/consomme/src/unix.rs | 32 +++++++++++++++++-- vm/devices/virtio/virtio_net/src/tests.rs | 17 +++++++--- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/unix.rs b/vm/devices/net/net_consomme/consomme/src/unix.rs index 01bf22ccd8..3cc44f8c35 100644 --- a/vm/devices/net/net_consomme/consomme/src/unix.rs +++ b/vm/devices/net/net_consomme/consomme/src/unix.rs @@ -14,7 +14,9 @@ // sendmsg_x (private Apple API) with a manually built msghdr_x array. #![expect(unsafe_code)] +#[cfg(target_os = "linux")] use std::mem::size_of; + use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::UdpSocket; @@ -25,7 +27,7 @@ use std::os::unix::io::AsRawFd; /// On Linux this calls `setsockopt(IPPROTO_UDP, UDP_SEGMENT, size)`, which /// persists for the lifetime of the connection. When `size` is 0 the option /// is cleared and normal (non-GSO) sends resume. -#[cfg(not(target_os = "macos"))] +#[cfg(target_os = "linux")] pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { // SAFETY: setsockopt with a valid u16 optval per Linux udp(7) documentation // for UDP_SEGMENT. 
@@ -48,7 +50,10 @@ pub fn set_udp_gso_size(socket: &UdpSocket, size: u16) -> std::io::Result<()> { /// /// On macOS the segment size is conveyed per-send via `sendmsg_x`, so there /// is nothing to configure on the socket itself. This is a no-op. -#[cfg(target_os = "macos")] +/// +/// On other Unix targets (e.g. *BSD), UDP GSO is not supported, so this is +/// also a no-op. +#[cfg(not(target_os = "linux"))] pub fn set_udp_gso_size(_socket: &UdpSocket, _size: u16) -> std::io::Result<()> { Ok(()) } @@ -59,6 +64,9 @@ pub fn set_udp_gso_size(_socket: &UdpSocket, _size: u16) -> std::io::Result<()> /// [`set_udp_gso_size`]. The kernel then automatically splits the outgoing /// buffer into datagrams of that size, so this is just a plain `send_to` /// regardless of the `gso` value. +/// +/// On other non-macOS Unix targets, this is also a plain `send_to` (GSO is +/// not supported). #[cfg(not(target_os = "macos"))] pub fn send_to( socket: &UdpSocket, @@ -127,6 +135,20 @@ pub fn send_to( )); } + // Cap the number of segments to avoid large allocations and excessive CPU + // from guest-controlled small segment sizes (e.g., seg_size = 1 on a 64 KB + // buffer would produce 65 535 entries). When the cap is exceeded, fall back + // to sending each chunk individually. + const MAX_BATCH_SEGMENTS: usize = 64; + let num_segments = data.len().div_ceil(seg_size); + if num_segments > MAX_BATCH_SEGMENTS { + let mut total = 0; + for chunk in data.chunks(seg_size) { + total += socket.send_to(chunk, *dst)?; + } + return Ok(total); + } + // Build one iovec per segment. let iovecs: Vec = data .chunks(seg_size) @@ -168,9 +190,13 @@ pub fn send_to( return Err(std::io::Error::last_os_error()); } + // Clamp to the number of messages we actually submitted. sendmsg_x is a + // private API, so defensively guard against an out-of-range return value. + let sent = (sent as usize).min(iovecs.len()); + // sendmsg_x returns the number of messages queued. 
Sum the byte counts of // the successfully sent entries to produce the total byte count. - Ok(iovecs[..sent as usize].iter().map(|iov| iov.iov_len).sum()) + Ok(iovecs[..sent].iter().map(|iov| iov.iov_len).sum()) } /// Checks whether the host has at least one non-link-local, non-loopback diff --git a/vm/devices/virtio/virtio_net/src/tests.rs b/vm/devices/virtio/virtio_net/src/tests.rs index 9340d9ccbb..7a7e07a9fd 100644 --- a/vm/devices/virtio/virtio_net/src/tests.rs +++ b/vm/devices/virtio/virtio_net/src/tests.rs @@ -1301,9 +1301,11 @@ fn tx_offload_tso6() { assert_eq!(meta.max_segment_size, 1440); } -/// TSO without needs_csum: GSO fields should still be parsed. +/// TSO without needs_csum: per the virtio spec, all GSO features require +/// VIRTIO_NET_F_CSUM and packets must set NEEDS_CSUM. When needs_csum is +/// false, segmentation offload must not be enabled. #[test] -fn tx_offload_tso_without_needs_csum() { +fn tx_offload_tso_without_needs_csum_ignored() { let header = make_virtio_header( false, // no needs_csum VirtioNetHeaderGsoProtocol::TCPV4, @@ -1319,9 +1321,14 @@ fn tx_offload_tso_without_needs_csum() { features_tso(), no_bank1(), ); - assert!(meta.flags.offload_tcp_segmentation()); - assert!(meta.flags.is_ipv4()); - assert_eq!(meta.max_segment_size, 1460); + assert!( + !meta.flags.offload_tcp_segmentation(), + "TSO must not be set without NEEDS_CSUM" + ); + assert_eq!( + meta.max_segment_size, 0, + "no segmentation without NEEDS_CSUM" + ); } /// USO4: gso_type=UDP_L4 with IPv4 EtherType. 
From 525b0264c03350587e9cc4db24efa93dbdfb0bf4 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Thu, 16 Apr 2026 14:52:26 -0700 Subject: [PATCH 18/28] wip --- vm/devices/net/net_backend/src/lib.rs | 16 +++ vm/devices/net/net_consomme/src/lib.rs | 77 ++++++++++-- vm/devices/net/net_tap/src/lib.rs | 133 +++++++++++++++----- vm/devices/virtio/virtio_net/src/buffers.rs | 36 ++++++ vm/devices/virtio/virtio_net/src/lib.rs | 2 + 5 files changed, 227 insertions(+), 37 deletions(-) diff --git a/vm/devices/net/net_backend/src/lib.rs b/vm/devices/net/net_backend/src/lib.rs index 1c53b2fe87..8b8f74e565 100644 --- a/vm/devices/net/net_backend/src/lib.rs +++ b/vm/devices/net/net_backend/src/lib.rs @@ -306,6 +306,17 @@ pub struct RxMetadata { pub l4_checksum: RxChecksumState, /// The L4 protocol. pub l4_protocol: L4Protocol, + /// The L3 protocol (IPv4/IPv6). Used for GSO/LRO metadata. + pub l3_protocol: L3Protocol, + /// L2 (Ethernet) header length in bytes (e.g. 14, or 18 with VLAN). + pub l2_len: u8, + /// L3 (IP) header length in bytes. + pub l3_len: u16, + /// L4 (TCP/UDP) header length in bytes. + pub l4_len: u8, + /// If non-zero, this is a GSO/LRO packet and this value is the MSS + /// (maximum segment size) that should be advertised to the guest. 
+ pub gso_size: u16, } impl Default for RxMetadata { @@ -316,6 +327,11 @@ impl Default for RxMetadata { ip_checksum: RxChecksumState::Unknown, l4_checksum: RxChecksumState::Unknown, l4_protocol: L4Protocol::Unknown, + l3_protocol: L3Protocol::Unknown, + l2_len: 0, + l3_len: 0, + l4_len: 0, + gso_size: 0, } } } diff --git a/vm/devices/net/net_consomme/src/lib.rs b/vm/devices/net/net_consomme/src/lib.rs index 39ffb9a4ee..95a48f8fbc 100644 --- a/vm/devices/net/net_consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/src/lib.rs @@ -17,6 +17,7 @@ use mesh::rpc::Rpc; use mesh::rpc::RpcError; use mesh::rpc::RpcSend; use net_backend::BufferAccess; +use net_backend::L3Protocol; use net_backend::L4Protocol; use net_backend::QueueConfig; use net_backend::RssConfig; @@ -494,6 +495,22 @@ impl consomme::Client for Client<'_> { }; let max = self.pool.capacity(rx_id) as usize; if data.len() <= max { + let l4_protocol = if checksum.tcp { + L4Protocol::Tcp + } else if checksum.udp { + L4Protocol::Udp + } else { + L4Protocol::Unknown + }; + + // Determine L3 protocol and header lengths for GSO metadata. + // Parse the Ethernet header to find IP version, then derive + // l2_len and l3_len from the packet. + let (l3_protocol, l2_len, l3_len, l4_len) = + parse_rx_header_lengths(data, checksum); + + let gso_size = checksum.tso.unwrap_or(0); + self.pool.write_packet( rx_id, &RxMetadata { @@ -509,13 +526,12 @@ impl consomme::Client for Client<'_> { } else { RxChecksumState::Unknown }, - l4_protocol: if checksum.tcp { - L4Protocol::Tcp - } else if checksum.udp { - L4Protocol::Udp - } else { - L4Protocol::Unknown - }, + l4_protocol, + l3_protocol, + gso_size, + l2_len, + l3_len, + l4_len, }, data, ); @@ -534,3 +550,50 @@ impl consomme::Client for Client<'_> { } } } + +/// Parse an Ethernet frame to extract L3 protocol, l2_len, l3_len, and l4_len. 
+/// +/// Used to populate `RxMetadata` GSO fields on the receive path so that +/// the virtio-net device can construct proper virtio headers for LRO packets. +fn parse_rx_header_lengths( + data: &[u8], + checksum: &ChecksumState, +) -> (L3Protocol, u8, u16, u8) { + const ETHERTYPE_IPV4: u16 = 0x0800; + const ETHERTYPE_IPV6: u16 = 0x86DD; + + if data.len() < 14 { + return (L3Protocol::Unknown, 0, 0, 0); + } + + let ethertype = u16::from_be_bytes([data[12], data[13]]); + let l2_len: u8 = 14; + + match ethertype { + ETHERTYPE_IPV4 if checksum.ipv4 && data.len() >= l2_len as usize + 20 => { + let ihl = (data[l2_len as usize] & 0x0f) as u16 * 4; + let l3_len = ihl.max(20); + let l4_start = l2_len as usize + l3_len as usize; + // Derive TCP header length from data offset field if TCP + let l4_len = if checksum.tcp && data.len() >= l4_start + 20 { + let data_offset = (data[l4_start + 12] >> 4) as u8 * 4; + data_offset.max(20) + } else { + 0 + }; + (L3Protocol::Ipv4, l2_len, l3_len, l4_len) + } + ETHERTYPE_IPV6 if data.len() >= l2_len as usize + 40 => { + let l3_len: u16 = 40; // Base IPv6 header; extension headers not handled + let l4_start = l2_len as usize + l3_len as usize; + let l4_len = if checksum.tcp && data.len() >= l4_start + 20 { + let data_offset = (data[l4_start + 12] >> 4) as u8 * 4; + data_offset.max(20) + } else { + 0 + }; + (L3Protocol::Ipv6, l2_len, l3_len, l4_len) + } + _ => (L3Protocol::Unknown, 0, 0, 0), + } +} diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index 4568c6326e..bc16c7c2d0 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -14,6 +14,7 @@ use futures::io::AsyncRead; use inspect::InspectMut; use net_backend::BufferAccess; use net_backend::Endpoint; +use net_backend::L3Protocol; use net_backend::L4Protocol; use net_backend::Queue; use net_backend::QueueConfig; @@ -115,25 +116,20 @@ pub struct TapEndpoint { impl TapEndpoint { pub fn new(tap: tap::Tap) -> Result { - // 
Do not enable any RX offloads (TUN_F_CSUM, TUN_F_TSO*, etc.). + // Enable RX offloads so the kernel can deliver large coalesced + // (GRO/LRO) TCP packets instead of segmenting them. This reduces + // per-packet overhead and improves throughput when the guest has + // negotiated VIRTIO_NET_F_GUEST_TSO4/6. // - // The TUN_F_* flags are the TAP equivalent of VIRTIO_NET_F_GUEST_*: - // they tell the kernel that our reader can handle partial checksums - // (NEEDS_CSUM) and unsegmented GSO packets. Since net_backend's - // RxMetadata has no way to represent "checksum needs to be completed" - // (only Good/Bad/Unknown), and no concept of receive-side GRO/RSC, - // accepting such packets would force us to either lie about checksum - // state or complete checksums in software. + // TUN_F_CSUM (1) — we can handle NEEDS_CSUM (partial checksum) + // TUN_F_TSO4 (2) — we can handle TSOv4 (large IPv4/TCP packets) + // TUN_F_TSO6 (4) — we can handle TSOv6 (large IPv6/TCP packets) // - // With offloads set to 0, the kernel completes all checksums and - // segments all GSO packets before delivering them to us. This is - // correct and simple. The TX path is unaffected — writes with - // NEEDS_CSUM and GSO types in the vnet header are processed by the - // kernel regardless of these flags. - // - // We explicitly set 0 rather than skipping the call, in case a - // previous user of this TAP fd set offloads to a non-zero value. - tap.set_offloads(0)?; + // TUN_F_CSUM is required for TUN_F_TSO4/6. + const TUN_F_CSUM: u32 = 1; + const TUN_F_TSO4: u32 = 2; + const TUN_F_TSO6: u32 = 4; + tap.set_offloads(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6)?; Ok(Self { tap: Arc::new(Mutex::new(Some(tap))), @@ -480,17 +476,19 @@ fn build_vnet_hdr(meta: &TxMetadata) -> VirtioNetHdr { /// Parse a `VirtioNetHdr` from the TAP device into receive metadata. 
/// -/// Because we do not set any `TUN_F_*` RX offload flags (see -/// [`TapEndpoint::new`]), the kernel will never send us `NEEDS_CSUM` or GSO -/// packets. We only need to handle `DATA_VALID` (checksum verified by the -/// kernel) and the default case (no information). -/// -/// The `gso_type` field should always be `GSO_NONE` since we didn't enable -/// receive-side GSO, but we still parse it defensively to extract L4 protocol -/// information if present. +/// With `TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6` enabled, the kernel may +/// deliver large coalesced packets with `NEEDS_CSUM` set and a non-NONE +/// `gso_type`. We translate these into `RxMetadata` GSO fields so the +/// virtio-net device can pass them to the guest as LRO packets. fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { let (ip_checksum, l4_checksum) = if hdr.flags.data_valid() { (RxChecksumState::Good, RxChecksumState::Good) + } else if hdr.flags.needs_csum() { + // NEEDS_CSUM means the data is valid but the L4 checksum in the + // header is incomplete (partial). For our purposes treat the + // checksums as good — the guest will be told via NEEDS_CSUM in + // the virtio header to complete them. + (RxChecksumState::Good, RxChecksumState::Good) } else { (RxChecksumState::Unknown, RxChecksumState::Unknown) }; @@ -501,12 +499,48 @@ fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { _ => L4Protocol::Unknown, }; + // Extract GSO metadata when the kernel delivers a coalesced packet. + let gso_protocol = hdr.gso_type.protocol(); + let (l3_protocol, gso_size, l2_len, l3_len, l4_len) = + if hdr.gso_size > 0 + && (gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 + || gso_protocol == VirtioNetHdrGsoProtocol::TCPV6) + { + let l3_proto = if gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 { + L3Protocol::Ipv4 + } else { + L3Protocol::Ipv6 + }; + // csum_start = l2_len + l3_len; we assume standard Ethernet (14 bytes) + // unless csum_start indicates otherwise. 
+ let l2 = if hdr.csum_start > 14 { 14u8 } else { 0 }; + let l3 = if l2 > 0 { + hdr.csum_start - l2 as u16 + } else { + 0 + }; + let l4 = if hdr.hdr_len > hdr.csum_start { + let v = hdr.hdr_len - hdr.csum_start; + if v <= u8::MAX as u16 { v as u8 } else { 0 } + } else { + 0 + }; + (l3_proto, hdr.gso_size, l2, l3, l4) + } else { + (L3Protocol::Unknown, 0, 0, 0, 0) + }; + RxMetadata { offset: 0, len: 0, ip_checksum, l4_checksum, l4_protocol, + l3_protocol, + gso_size, + l2_len, + l3_len, + l4_len, } } @@ -601,20 +635,59 @@ mod tests { } #[test] - fn rx_metadata_from_vnet_hdr_needs_csum_treated_as_unknown() { - // We don't set TUN_F_CSUM so the kernel should never send NEEDS_CSUM, - // but if it did, we conservatively treat it as Unknown (not Good). + fn rx_metadata_from_vnet_hdr_needs_csum_treated_as_good() { + // With TUN_F_CSUM enabled, NEEDS_CSUM means data is valid but + // checksum is partial — treat as Good for our purposes. let hdr = VirtioNetHdr { flags: VirtioNetHdrFlags::new().with_needs_csum(true), gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV6), ..Default::default() }; let meta = parse_vnet_hdr(&hdr); - assert_eq!(meta.ip_checksum, RxChecksumState::Unknown); - assert_eq!(meta.l4_checksum, RxChecksumState::Unknown); + assert_eq!(meta.ip_checksum, RxChecksumState::Good); + assert_eq!(meta.l4_checksum, RxChecksumState::Good); assert_eq!(meta.l4_protocol, L4Protocol::Tcp); } + #[test] + fn rx_metadata_from_vnet_hdr_gso_tcpv4() { + let hdr = VirtioNetHdr { + flags: VirtioNetHdrFlags::new().with_needs_csum(true), + gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV4), + hdr_len: 14 + 20 + 32, // eth + ipv4 + tcp w/options + gso_size: 1460, + csum_start: 14 + 20, + csum_offset: 16, + ..Default::default() + }; + let meta = parse_vnet_hdr(&hdr); + assert_eq!(meta.l3_protocol, L3Protocol::Ipv4); + assert_eq!(meta.gso_size, 1460); + assert_eq!(meta.l2_len, 14); + assert_eq!(meta.l3_len, 20); + 
assert_eq!(meta.l4_len, 32); + assert_eq!(meta.l4_protocol, L4Protocol::Tcp); + } + + #[test] + fn rx_metadata_from_vnet_hdr_gso_tcpv6() { + let hdr = VirtioNetHdr { + flags: VirtioNetHdrFlags::new().with_needs_csum(true), + gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV6), + hdr_len: 14 + 40 + 20, + gso_size: 1440, + csum_start: 14 + 40, + csum_offset: 16, + ..Default::default() + }; + let meta = parse_vnet_hdr(&hdr); + assert_eq!(meta.l3_protocol, L3Protocol::Ipv6); + assert_eq!(meta.gso_size, 1440); + assert_eq!(meta.l2_len, 14); + assert_eq!(meta.l3_len, 40); + assert_eq!(meta.l4_len, 20); + } + #[test] fn rx_metadata_from_vnet_hdr_none() { let hdr = VirtioNetHdr::default(); diff --git a/vm/devices/virtio/virtio_net/src/buffers.rs b/vm/devices/virtio/virtio_net/src/buffers.rs index 56e475461d..2538ec8152 100644 --- a/vm/devices/virtio/virtio_net/src/buffers.rs +++ b/vm/devices/virtio/virtio_net/src/buffers.rs @@ -3,10 +3,13 @@ use crate::VirtioNetHeader; use crate::VirtioNetHeaderFlags; +use crate::VirtioNetHeaderGso; +use crate::VirtioNetHeaderGsoProtocol; use crate::header_size; use guestmem::GuestMemory; use inspect::Inspect; use net_backend::BufferAccess; +use net_backend::L3Protocol; use net_backend::RxBufferSegment; use net_backend::RxId; use net_backend::RxMetadata; @@ -170,8 +173,41 @@ impl BufferAccess for VirtioWorkPool { let data_valid = metadata.ip_checksum.is_valid() && metadata.l4_checksum.is_valid(); let flags = VirtioNetHeaderFlags::new().with_data_valid(data_valid); + // Build GSO fields when the backend indicates a large/coalesced packet. 
+ let (gso_type, gso_size, hdr_len, csum_start, csum_offset) = + if metadata.gso_size > 0 && metadata.l2_len > 0 && metadata.l3_len > 0 { + let gso_protocol = match metadata.l3_protocol { + L3Protocol::Ipv4 => VirtioNetHeaderGsoProtocol::TCPV4, + L3Protocol::Ipv6 => VirtioNetHeaderGsoProtocol::TCPV6, + L3Protocol::Unknown => VirtioNetHeaderGsoProtocol::NONE, + }; + let gso_type_byte: u8 = + VirtioNetHeaderGso::new().with_protocol(gso_protocol).into(); + let total_hdr = + metadata.l2_len as u16 + metadata.l3_len + metadata.l4_len as u16; + let csum_start = metadata.l2_len as u16 + metadata.l3_len; + // TCP checksum offset within TCP header is 16. + let csum_offset: u16 = 16; + (gso_type_byte, metadata.gso_size, total_hdr, csum_start, csum_offset) + } else { + (0, 0, 0, 0, 0) + }; + + // When GSO is active, set NEEDS_CSUM so the guest computes + // per-segment checksums. + let flags = if gso_size > 0 { + flags.with_needs_csum(true) + } else { + flags + }; + let virtio_net_header = VirtioNetHeader { flags: flags.into(), + gso_type, + gso_size, + hdr_len, + csum_start, + csum_offset, num_buffers: 1, ..FromZeros::new_zeroed() }; diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index cf43acd6f8..539448e7ea 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -263,6 +263,8 @@ impl VirtioDevice for Device { .with_mac(true) .with_csum(csum) .with_guest_csum(true) + .with_guest_tso4(true) + .with_guest_tso6(true) .with_host_tso4(host_tso) .with_host_tso6(host_tso); From f072562eef1063698033afefc789a8abe50d7381 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Thu, 16 Apr 2026 15:18:31 -0700 Subject: [PATCH 19/28] Add burette test --- petri/burette/src/tests/network.rs | 143 ++++++++++++++++++++ vm/devices/net/net_consomme/src/lib.rs | 8 +- vm/devices/net/net_tap/src/lib.rs | 51 ++++--- vm/devices/virtio/virtio_net/src/buffers.rs | 11 +- 4 files changed, 178 insertions(+), 35 
deletions(-) diff --git a/petri/burette/src/tests/network.rs b/petri/burette/src/tests/network.rs index b3a2bd717d..749a7df901 100644 --- a/petri/burette/src/tests/network.rs +++ b/petri/burette/src/tests/network.rs @@ -72,6 +72,18 @@ pub struct NetworkTestState { _helper_mesh: mesh_process::Mesh, /// Async driver for timers. driver: pal_async::DefaultDriver, + /// Kernel interface name of the virtio-net NIC in the guest, discovered + /// during setup. Set only for `NetBackend::Tap` + `NicBackend::VirtioNet`; + /// used by the LRO-off paired metric to toggle guest-side receive + /// offloads via `ethtool -K`. + #[cfg_attr( + not(target_os = "linux"), + expect( + dead_code, + reason = "only read by TAP + virtio-net LRO-off paired metric, which is Linux-only" + ) + )] + virtio_nic_dev: Option, } fn build_firmware(resolver: &petri::ArtifactResolver<'_>) -> petri::Firmware { @@ -300,6 +312,24 @@ impl crate::harness::WarmPerfTest for NetworkTest { .await .context("failed to prepare chroot at /perf")?; + // For TAP + virtio-net, discover the guest NIC name and verify that + // GRO (the guest-visible face of VIRTIO_NET_F_GUEST_TSO4/6, i.e. LRO) + // is advertised. Captured here so `run_once` can toggle offloads + // between iterations to produce a paired LRO-off metric. 
+ let virtio_nic_dev = match (self.backend, self.nic) { + #[cfg(target_os = "linux")] + (NetBackend::Tap, NicBackend::VirtioNet) => { + let dev = tap::discover_virtio_nic(&agent) + .await + .context("failed to discover virtio-net NIC in guest")?; + tap::verify_lro_advertised(&agent, &dev) + .await + .context("LRO not advertised on virtio-net NIC")?; + Some(dev) + } + _ => None, + }; + Ok(NetworkTestState { vm, agent, @@ -307,6 +337,7 @@ impl crate::harness::WarmPerfTest for NetworkTest { iperf_requests: ready.requests, _helper_mesh: helper_mesh, driver: driver.clone(), + virtio_nic_dev, }) } @@ -340,6 +371,31 @@ impl crate::harness::WarmPerfTest for NetworkTest { recorder.stop()?; metrics.push(m); + // Paired LRO-off TCP RX metric for TAP + virtio-net. Disable GRO/LRO/ + // TSO on the guest NIC so the virtio-net receive path cannot coalesce + // segments, rerun the same flow, and restore offloads afterwards so + // subsequent iterations start in the default LRO-on state. + #[cfg(target_os = "linux")] + if let (NetBackend::Tap, NicBackend::VirtioNet, Some(dev)) = + (self.backend, self.nic, state.virtio_nic_dev.clone()) + { + tap::set_offloads(&state.agent, &dev, false) + .await + .context("failed to disable receive offloads on guest NIC")?; + let name = format!("{prefix}_tcp_rx_lro_off_gbps"); + recorder.start(&name)?; + let result = self + .run_iperf3(state, host_ip, base_port + 3, &name, IperfMode::TcpRx) + .await; + recorder.stop()?; + // Always try to re-enable offloads even if the RX run failed so + // later iterations are not left with offloads disabled. 
+ let restore = tap::set_offloads(&state.agent, &dev, true).await; + let m = result.context("TCP RX (LRO off) test failed")?; + restore.context("failed to re-enable receive offloads on guest NIC")?; + metrics.push(m); + } + // UDP TX (guest sends to host) let name = format!("{prefix}_udp_tx_pps"); recorder.start(&name)?; @@ -595,6 +651,93 @@ done Ok("192.168.100.1".to_string()) } + + /// Discover the kernel interface name of the virtio-net TAP NIC in the + /// guest by matching on its MAC address. + pub(super) async fn discover_virtio_nic( + agent: &petri::pipette::PipetteClient, + ) -> anyhow::Result { + let sh = agent.unix_shell(); + // Emit the device name for the interface whose MAC matches the TAP + // NIC MAC (…:13) configured above. + let script = r#" +for dev in /sys/class/net/*/; do + name=$(basename "$dev") + mac=$(cat "$dev/address") + if [ "$mac" = "00:15:5d:12:12:13" ]; then + printf '%s' "$name" + exit 0 + fi +done +exit 1 +"#; + let name = cmd!(sh, "sh -c {script}") + .read() + .await + .context("failed to locate virtio-net NIC by MAC in guest")?; + let name = name.trim().to_string(); + anyhow::ensure!( + !name.is_empty(), + "virtio-net NIC not found in /sys/class/net by MAC" + ); + tracing::info!(dev = %name, "discovered guest virtio-net NIC"); + Ok(name) + } + + /// Assert that the guest NIC advertises GRO (the receive-side face of + /// VIRTIO_NET_F_GUEST_TSO4/6, i.e. LRO). Runs `ethtool -k` from the + /// petritools erofs via the `/perf` chroot. 
+ pub(super) async fn verify_lro_advertised( + agent: &petri::pipette::PipetteClient, + dev: &str, + ) -> anyhow::Result<()> { + let mut sh = agent.unix_shell(); + sh.chroot("/perf"); + let features = cmd!(sh, "ethtool -k {dev}") + .read() + .await + .context("failed to run `ethtool -k` in guest (missing from petritools erofs?)")?; + tracing::info!(dev = %dev, features = %features, "guest NIC offload features"); + // Only check the `on` state; we don't require the feature to be + // fixed, just that the kernel is willing to do GRO on this NIC. + let gro_on = features + .lines() + .map(str::trim) + .any(|line| line.starts_with("generic-receive-offload:") && line.contains("on")); + anyhow::ensure!( + gro_on, + "generic-receive-offload is not advertised as `on` for {dev}; \ + LRO (VIRTIO_NET_F_GUEST_TSO) is not being exercised" + ); + Ok(()) + } + + /// Toggle the guest NIC's receive-side segmentation offloads. When + /// `enabled` is false, GRO/LRO/TSO are all disabled so the virtio-net + /// receive path cannot coalesce segments; when true, they are turned + /// back on. + pub(super) async fn set_offloads( + agent: &petri::pipette::PipetteClient, + dev: &str, + enabled: bool, + ) -> anyhow::Result<()> { + let mut sh = agent.unix_shell(); + sh.chroot("/perf"); + let onoff = if enabled { "on" } else { "off" }; + // Not every feature is supported on every kernel/NIC combination — + // `ethtool -K` will fail for fixed features. Run each independently + // and ignore individual failures so an unsupported feature does not + // break the test, but log the result for debugging. 
+ for feat in ["gro", "lro", "tso"] { + let out = cmd!(sh, "ethtool -K {dev} {feat} {onoff}") + .ignore_status() + .read() + .await + .unwrap_or_default(); + tracing::info!(dev = %dev, feat = %feat, onoff = %onoff, out = %out, "ethtool -K"); + } + Ok(()) + } } // --------------------------------------------------------------------------- diff --git a/vm/devices/net/net_consomme/src/lib.rs b/vm/devices/net/net_consomme/src/lib.rs index 95a48f8fbc..421490cdf3 100644 --- a/vm/devices/net/net_consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/src/lib.rs @@ -506,8 +506,7 @@ impl consomme::Client for Client<'_> { // Determine L3 protocol and header lengths for GSO metadata. // Parse the Ethernet header to find IP version, then derive // l2_len and l3_len from the packet. - let (l3_protocol, l2_len, l3_len, l4_len) = - parse_rx_header_lengths(data, checksum); + let (l3_protocol, l2_len, l3_len, l4_len) = parse_rx_header_lengths(data, checksum); let gso_size = checksum.tso.unwrap_or(0); @@ -555,10 +554,7 @@ impl consomme::Client for Client<'_> { /// /// Used to populate `RxMetadata` GSO fields on the receive path so that /// the virtio-net device can construct proper virtio headers for LRO packets. -fn parse_rx_header_lengths( - data: &[u8], - checksum: &ChecksumState, -) -> (L3Protocol, u8, u16, u8) { +fn parse_rx_header_lengths(data: &[u8], checksum: &ChecksumState) -> (L3Protocol, u8, u16, u8) { const ETHERTYPE_IPV4: u16 = 0x0800; const ETHERTYPE_IPV6: u16 = 0x86DD; diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index bc16c7c2d0..aa71fc1b48 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -501,34 +501,33 @@ fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { // Extract GSO metadata when the kernel delivers a coalesced packet. 
let gso_protocol = hdr.gso_type.protocol(); - let (l3_protocol, gso_size, l2_len, l3_len, l4_len) = - if hdr.gso_size > 0 - && (gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 - || gso_protocol == VirtioNetHdrGsoProtocol::TCPV6) - { - let l3_proto = if gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 { - L3Protocol::Ipv4 - } else { - L3Protocol::Ipv6 - }; - // csum_start = l2_len + l3_len; we assume standard Ethernet (14 bytes) - // unless csum_start indicates otherwise. - let l2 = if hdr.csum_start > 14 { 14u8 } else { 0 }; - let l3 = if l2 > 0 { - hdr.csum_start - l2 as u16 - } else { - 0 - }; - let l4 = if hdr.hdr_len > hdr.csum_start { - let v = hdr.hdr_len - hdr.csum_start; - if v <= u8::MAX as u16 { v as u8 } else { 0 } - } else { - 0 - }; - (l3_proto, hdr.gso_size, l2, l3, l4) + let (l3_protocol, gso_size, l2_len, l3_len, l4_len) = if hdr.gso_size > 0 + && (gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 + || gso_protocol == VirtioNetHdrGsoProtocol::TCPV6) + { + let l3_proto = if gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 { + L3Protocol::Ipv4 } else { - (L3Protocol::Unknown, 0, 0, 0, 0) + L3Protocol::Ipv6 }; + // csum_start = l2_len + l3_len; we assume standard Ethernet (14 bytes) + // unless csum_start indicates otherwise. 
+ let l2 = if hdr.csum_start > 14 { 14u8 } else { 0 }; + let l3 = if l2 > 0 { + hdr.csum_start - l2 as u16 + } else { + 0 + }; + let l4 = if hdr.hdr_len > hdr.csum_start { + let v = hdr.hdr_len - hdr.csum_start; + if v <= u8::MAX as u16 { v as u8 } else { 0 } + } else { + 0 + }; + (l3_proto, hdr.gso_size, l2, l3, l4) + } else { + (L3Protocol::Unknown, 0, 0, 0, 0) + }; RxMetadata { offset: 0, diff --git a/vm/devices/virtio/virtio_net/src/buffers.rs b/vm/devices/virtio/virtio_net/src/buffers.rs index 2538ec8152..c29bfb5913 100644 --- a/vm/devices/virtio/virtio_net/src/buffers.rs +++ b/vm/devices/virtio/virtio_net/src/buffers.rs @@ -183,12 +183,17 @@ impl BufferAccess for VirtioWorkPool { }; let gso_type_byte: u8 = VirtioNetHeaderGso::new().with_protocol(gso_protocol).into(); - let total_hdr = - metadata.l2_len as u16 + metadata.l3_len + metadata.l4_len as u16; + let total_hdr = metadata.l2_len as u16 + metadata.l3_len + metadata.l4_len as u16; let csum_start = metadata.l2_len as u16 + metadata.l3_len; // TCP checksum offset within TCP header is 16. 
let csum_offset: u16 = 16; - (gso_type_byte, metadata.gso_size, total_hdr, csum_start, csum_offset) + ( + gso_type_byte, + metadata.gso_size, + total_hdr, + csum_start, + csum_offset, + ) } else { (0, 0, 0, 0, 0) }; From 3e217d98835557cdcf8e57f8d5a0199cc667d6cd Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Thu, 16 Apr 2026 15:46:22 -0700 Subject: [PATCH 20/28] ethtool not in PATH --- petri/burette/src/tests/network.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/petri/burette/src/tests/network.rs b/petri/burette/src/tests/network.rs index 749a7df901..7eec83b485 100644 --- a/petri/burette/src/tests/network.rs +++ b/petri/burette/src/tests/network.rs @@ -693,7 +693,7 @@ exit 1 ) -> anyhow::Result<()> { let mut sh = agent.unix_shell(); sh.chroot("/perf"); - let features = cmd!(sh, "ethtool -k {dev}") + let features = cmd!(sh, "/usr/sbin/ethtool -k {dev}") .read() .await .context("failed to run `ethtool -k` in guest (missing from petritools erofs?)")?; @@ -729,7 +729,7 @@ exit 1 // and ignore individual failures so an unsupported feature does not // break the test, but log the result for debugging. for feat in ["gro", "lro", "tso"] { - let out = cmd!(sh, "ethtool -K {dev} {feat} {onoff}") + let out = cmd!(sh, "/usr/sbin/ethtool -K {dev} {feat} {onoff}") .ignore_status() .read() .await From 6dafc8e1284346b545afb3808d8ec70a2be33235 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Fri, 17 Apr 2026 12:59:43 -0700 Subject: [PATCH 21/28] . --- vm/devices/net/net_consomme/consomme/src/unix.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/unix.rs b/vm/devices/net/net_consomme/consomme/src/unix.rs index 3cc44f8c35..fa1f18dbbd 100644 --- a/vm/devices/net/net_consomme/consomme/src/unix.rs +++ b/vm/devices/net/net_consomme/consomme/src/unix.rs @@ -14,9 +14,6 @@ // sendmsg_x (private Apple API) with a manually built msghdr_x array. 
#![expect(unsafe_code)] -#[cfg(target_os = "linux")] -use std::mem::size_of; - use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::UdpSocket; From c989e69a9e6e51ede973d1621b746c46481b86d7 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Fri, 17 Apr 2026 13:46:42 -0700 Subject: [PATCH 22/28] undo --- petri/burette/src/tests/network.rs | 143 ----------------------------- 1 file changed, 143 deletions(-) diff --git a/petri/burette/src/tests/network.rs b/petri/burette/src/tests/network.rs index 7eec83b485..b3a2bd717d 100644 --- a/petri/burette/src/tests/network.rs +++ b/petri/burette/src/tests/network.rs @@ -72,18 +72,6 @@ pub struct NetworkTestState { _helper_mesh: mesh_process::Mesh, /// Async driver for timers. driver: pal_async::DefaultDriver, - /// Kernel interface name of the virtio-net NIC in the guest, discovered - /// during setup. Set only for `NetBackend::Tap` + `NicBackend::VirtioNet`; - /// used by the LRO-off paired metric to toggle guest-side receive - /// offloads via `ethtool -K`. - #[cfg_attr( - not(target_os = "linux"), - expect( - dead_code, - reason = "only read by TAP + virtio-net LRO-off paired metric, which is Linux-only" - ) - )] - virtio_nic_dev: Option<String>, } fn build_firmware(resolver: &petri::ArtifactResolver<'_>) -> petri::Firmware { @@ -312,24 +300,6 @@ impl crate::harness::WarmPerfTest for NetworkTest { .await .context("failed to prepare chroot at /perf")?; - // For TAP + virtio-net, discover the guest NIC name and verify that - // GRO (the guest-visible face of VIRTIO_NET_F_GUEST_TSO4/6, i.e. LRO) - // is advertised. Captured here so `run_once` can toggle offloads - // between iterations to produce a paired LRO-off metric.
- let virtio_nic_dev = match (self.backend, self.nic) { - #[cfg(target_os = "linux")] - (NetBackend::Tap, NicBackend::VirtioNet) => { - let dev = tap::discover_virtio_nic(&agent) - .await - .context("failed to discover virtio-net NIC in guest")?; - tap::verify_lro_advertised(&agent, &dev) - .await - .context("LRO not advertised on virtio-net NIC")?; - Some(dev) - } - _ => None, - }; - Ok(NetworkTestState { vm, agent, @@ -337,7 +307,6 @@ impl crate::harness::WarmPerfTest for NetworkTest { iperf_requests: ready.requests, _helper_mesh: helper_mesh, driver: driver.clone(), - virtio_nic_dev, }) } @@ -371,31 +340,6 @@ impl crate::harness::WarmPerfTest for NetworkTest { recorder.stop()?; metrics.push(m); - // Paired LRO-off TCP RX metric for TAP + virtio-net. Disable GRO/LRO/ - // TSO on the guest NIC so the virtio-net receive path cannot coalesce - // segments, rerun the same flow, and restore offloads afterwards so - // subsequent iterations start in the default LRO-on state. - #[cfg(target_os = "linux")] - if let (NetBackend::Tap, NicBackend::VirtioNet, Some(dev)) = - (self.backend, self.nic, state.virtio_nic_dev.clone()) - { - tap::set_offloads(&state.agent, &dev, false) - .await - .context("failed to disable receive offloads on guest NIC")?; - let name = format!("{prefix}_tcp_rx_lro_off_gbps"); - recorder.start(&name)?; - let result = self - .run_iperf3(state, host_ip, base_port + 3, &name, IperfMode::TcpRx) - .await; - recorder.stop()?; - // Always try to re-enable offloads even if the RX run failed so - // later iterations are not left with offloads disabled. 
- let restore = tap::set_offloads(&state.agent, &dev, true).await; - let m = result.context("TCP RX (LRO off) test failed")?; - restore.context("failed to re-enable receive offloads on guest NIC")?; - metrics.push(m); - } - // UDP TX (guest sends to host) let name = format!("{prefix}_udp_tx_pps"); recorder.start(&name)?; @@ -651,93 +595,6 @@ done Ok("192.168.100.1".to_string()) } - - /// Discover the kernel interface name of the virtio-net TAP NIC in the - /// guest by matching on its MAC address. - pub(super) async fn discover_virtio_nic( - agent: &petri::pipette::PipetteClient, - ) -> anyhow::Result<String> { - let sh = agent.unix_shell(); - // Emit the device name for the interface whose MAC matches the TAP - // NIC MAC (…:13) configured above. - let script = r#" -for dev in /sys/class/net/*/; do - name=$(basename "$dev") - mac=$(cat "$dev/address") - if [ "$mac" = "00:15:5d:12:12:13" ]; then - printf '%s' "$name" - exit 0 - fi -done -exit 1 -"#; - let name = cmd!(sh, "sh -c {script}") - .read() - .await - .context("failed to locate virtio-net NIC by MAC in guest")?; - let name = name.trim().to_string(); - anyhow::ensure!( - !name.is_empty(), - "virtio-net NIC not found in /sys/class/net by MAC" - ); - tracing::info!(dev = %name, "discovered guest virtio-net NIC"); - Ok(name) - } - - /// Assert that the guest NIC advertises GRO (the receive-side face of - /// VIRTIO_NET_F_GUEST_TSO4/6, i.e. LRO). Runs `ethtool -k` from the - /// petritools erofs via the `/perf` chroot.
- pub(super) async fn verify_lro_advertised( - agent: &petri::pipette::PipetteClient, - dev: &str, - ) -> anyhow::Result<()> { - let mut sh = agent.unix_shell(); - sh.chroot("/perf"); - let features = cmd!(sh, "/usr/sbin/ethtool -k {dev}") - .read() - .await - .context("failed to run `ethtool -k` in guest (missing from petritools erofs?)")?; - tracing::info!(dev = %dev, features = %features, "guest NIC offload features"); - // Only check the `on` state; we don't require the feature to be - // fixed, just that the kernel is willing to do GRO on this NIC. - let gro_on = features - .lines() - .map(str::trim) - .any(|line| line.starts_with("generic-receive-offload:") && line.contains("on")); - anyhow::ensure!( - gro_on, - "generic-receive-offload is not advertised as `on` for {dev}; \ - LRO (VIRTIO_NET_F_GUEST_TSO) is not being exercised" - ); - Ok(()) - } - - /// Toggle the guest NIC's receive-side segmentation offloads. When - /// `enabled` is false, GRO/LRO/TSO are all disabled so the virtio-net - /// receive path cannot coalesce segments; when true, they are turned - /// back on. - pub(super) async fn set_offloads( - agent: &petri::pipette::PipetteClient, - dev: &str, - enabled: bool, - ) -> anyhow::Result<()> { - let mut sh = agent.unix_shell(); - sh.chroot("/perf"); - let onoff = if enabled { "on" } else { "off" }; - // Not every feature is supported on every kernel/NIC combination — - // `ethtool -K` will fail for fixed features. Run each independently - // and ignore individual failures so an unsupported feature does not - // break the test, but log the result for debugging. 
- for feat in ["gro", "lro", "tso"] { - let out = cmd!(sh, "/usr/sbin/ethtool -K {dev} {feat} {onoff}") - .ignore_status() - .read() - .await - .unwrap_or_default(); - tracing::info!(dev = %dev, feat = %feat, onoff = %onoff, out = %out, "ethtool -K"); - } - Ok(()) - } } // --------------------------------------------------------------------------- From 3a1cc49ea3e411dde34b87452da82134b5c3e756 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Thu, 23 Apr 2026 08:02:28 -0700 Subject: [PATCH 23/28] Bring consomme up to par --- .../net/net_consomme/consomme/src/tcp.rs | 72 +++++++++++++++---- vm/devices/net/net_consomme/src/lib.rs | 4 +- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/tcp.rs b/vm/devices/net/net_consomme/consomme/src/tcp.rs index 96f25ab1c7..db1d694053 100644 --- a/vm/devices/net/net_consomme/consomme/src/tcp.rs +++ b/vm/devices/net/net_consomme/consomme/src/tcp.rs @@ -492,7 +492,19 @@ struct Sender<'a, T> { } impl Sender<'_, T> { - fn send_packet(&mut self, tcp: &TcpRepr<'_>, payload: Option>) { + /// Assemble and deliver a TCP packet to the client. + /// + /// When `tso_mss` is `Some(mss)`, the payload is larger than a single + /// segment and the packet is delivered with [`ChecksumState::tso`] set so + /// that the downstream virtio-net device can present it to the guest as an + /// LRO/GSO packet. In this mode the TCP checksum is left as a + /// pseudo-header partial checksum (the guest completes it per-segment). 
+ fn send_packet( + &mut self, + tcp: &TcpRepr<'_>, + payload: Option>, + tso_mss: Option, + ) { let buffer = &mut self.state.buffer; let mut eth_packet = EthernetFrame::new_unchecked(&mut buffer[..]); eth_packet.set_dst_addr(self.state.params.client_mac); @@ -543,11 +555,27 @@ impl Sender<'_, T> { if let Some(payload) = &payload { payload.copy_to_slice(tcp_packet.payload_mut()); } - tcp_packet.fill_checksum(&self.ft.dst.ip().into(), &self.ft.src.ip().into()); + + if tso_mss.is_none() { + // Normal single-segment packet: compute the full checksum. + tcp_packet.fill_checksum(&self.ft.dst.ip().into(), &self.ft.src.ip().into()); + } + // For TSO packets the checksum field is left as emitted by + // smoltcp (zero / pseudo-header partial). The guest driver + // will compute per-segment checksums via NEEDS_CSUM. + let n = ETHERNET_HEADER_LEN + ip_total_len; - let checksum_state = match self.ft.dst { - SocketAddr::V4(_) => ChecksumState::TCP4, - SocketAddr::V6(_) => ChecksumState::TCP6, + let checksum_state = match (self.ft.dst, tso_mss) { + (SocketAddr::V4(_), Some(mss)) => ChecksumState { + tso: Some(mss), + ..ChecksumState::TCP4 + }, + (SocketAddr::V6(_), Some(mss)) => ChecksumState { + tso: Some(mss), + ..ChecksumState::TCP6 + }, + (SocketAddr::V4(_), None) => ChecksumState::TCP4, + (SocketAddr::V6(_), None) => ChecksumState::TCP6, }; self.client.recv(&buffer[..n], &checksum_state); @@ -571,7 +599,7 @@ impl Sender<'_, T> { trace_tcp_packet(&tcp, 0, "rst xmit"); - self.send_packet(&tcp, None); + self.send_packet(&tcp, None, None); } } @@ -1005,7 +1033,7 @@ impl TcpConnectionInner { payload: &[], }; - sender.send_packet(&tcp, None); + sender.send_packet(&tcp, None, None); self.tx_send += 1; } @@ -1044,7 +1072,9 @@ impl TcpConnectionInner { // exceeding: // 1. The available buffer length. // 2. The current window. - // 3. The configured maximum segment size. + // 3. 
The configured maximum segment size (only when the client + // buffer is not large enough for LRO — when it is, we emit one + // large frame and let the guest segment it). // 4. The client MTU. let tx_segment_end = { let ip_header_len = match sender.ft.dst { @@ -1053,11 +1083,21 @@ impl TcpConnectionInner { }; let header_len = ETHERNET_HEADER_LEN + ip_header_len + tcp.header_len(); let mtu = rx_mtu.min(sender.state.buffer.len()); + let max_payload = mtu - header_len; + // When the client buffer can hold more than one MSS of + // payload, skip the MSS cap and fill the whole buffer — + // the packet will be delivered as an LRO/TSO frame. + // Otherwise, apply the MSS limit for normal segmentation. + let mss_limit = if max_payload > self.tx_mss { + tx_next + max_payload + } else { + tx_next + self.tx_mss + }; seq_min([ tx_payload_end, tx_window_end, - tx_next + self.tx_mss, - tx_next + (mtu - header_len), + mss_limit, + tx_next + max_payload, ]) }; @@ -1088,7 +1128,15 @@ impl TcpConnectionInner { .tx_buffer .view(payload_start..payload_start + payload_len); - sender.send_packet(&tcp, Some(payload)); + // When the payload exceeds a single MSS, deliver the frame as a + // TSO/LRO packet so the guest can re-segment it. 
+ let tso_mss = if payload_len > self.tx_mss { + Some(self.tx_mss as u16) + } else { + None + }; + + sender.send_packet(&tcp, Some(payload), tso_mss); self.tx_send = tx_next; self.needs_ack = false; } @@ -1139,7 +1187,7 @@ impl TcpConnectionInner { trace_tcp_packet(&tcp, 0, "ack"); - sender.send_packet(&tcp, None); + sender.send_packet(&tcp, None, None); } fn handle_listen_syn( diff --git a/vm/devices/net/net_consomme/src/lib.rs b/vm/devices/net/net_consomme/src/lib.rs index bd0372c4d5..19ee8fe5ed 100644 --- a/vm/devices/net/net_consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/src/lib.rs @@ -578,7 +578,7 @@ fn parse_rx_header_lengths(data: &[u8], checksum: &ChecksumState) -> (L3Protocol let l4_start = l2_len as usize + l3_len as usize; // Derive TCP header length from data offset field if TCP let l4_len = if checksum.tcp && data.len() >= l4_start + 20 { - let data_offset = (data[l4_start + 12] >> 4) as u8 * 4; + let data_offset = (data[l4_start + 12] >> 4) * 4; data_offset.max(20) } else { 0 @@ -589,7 +589,7 @@ fn parse_rx_header_lengths(data: &[u8], checksum: &ChecksumState) -> (L3Protocol let l3_len: u16 = 40; // Base IPv6 header; extension headers not handled let l4_start = l2_len as usize + l3_len as usize; let l4_len = if checksum.tcp && data.len() >= l4_start + 20 { - let data_offset = (data[l4_start + 12] >> 4) as u8 * 4; + let data_offset = (data[l4_start + 12] >> 4) * 4; data_offset.max(20) } else { 0 From 0026368ca8695b8718ef1f4d912f9a2533abb4d7 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Mon, 27 Apr 2026 11:12:52 -0700 Subject: [PATCH 24/28] review --- .../net/net_consomme/consomme/src/tcp.rs | 2 +- vm/devices/net/net_consomme/src/lib.rs | 5 ++++- vm/devices/net/net_tap/src/lib.rs | 8 +++++--- vm/devices/virtio/virtio_net/src/buffers.rs | 18 +++++++++++++++--- vm/devices/virtio/virtio_net/src/lib.rs | 15 +++++++++++++-- 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/tcp.rs 
b/vm/devices/net/net_consomme/consomme/src/tcp.rs index db1d694053..68d7b36e44 100644 --- a/vm/devices/net/net_consomme/consomme/src/tcp.rs +++ b/vm/devices/net/net_consomme/consomme/src/tcp.rs @@ -1131,7 +1131,7 @@ impl TcpConnectionInner { // When the payload exceeds a single MSS, deliver the frame as a // TSO/LRO packet so the guest can re-segment it. let tso_mss = if payload_len > self.tx_mss { - Some(self.tx_mss as u16) + Some(self.tx_mss.min(u16::MAX as usize) as u16) } else { None }; diff --git a/vm/devices/net/net_consomme/src/lib.rs b/vm/devices/net/net_consomme/src/lib.rs index 19ee8fe5ed..46dba438ad 100644 --- a/vm/devices/net/net_consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/src/lib.rs @@ -586,7 +586,10 @@ fn parse_rx_header_lengths(data: &[u8], checksum: &ChecksumState) -> (L3Protocol (L3Protocol::Ipv4, l2_len, l3_len, l4_len) } ETHERTYPE_IPV6 if data.len() >= l2_len as usize + 40 => { - let l3_len: u16 = 40; // Base IPv6 header; extension headers not handled + // Base IPv6 header only. Extension headers are not parsed, but + // this is safe because consomme never generates IPv6 extension + // headers on the receive path. 
+ let l3_len: u16 = 40; let l4_start = l2_len as usize + l3_len as usize; let l4_len = if checksum.tcp && data.len() >= l4_start + 20 { let data_offset = (data[l4_start + 12] >> 4) * 4; diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index 07cf52f190..fb5c101553 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -506,14 +506,15 @@ fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { (RxChecksumState::Unknown, RxChecksumState::Unknown) }; - let l4_protocol = match hdr.gso_type.protocol() { + let gso_protocol = hdr.gso_type.protocol(); + + let l4_protocol = match gso_protocol { VirtioNetHdrGsoProtocol::TCPV4 | VirtioNetHdrGsoProtocol::TCPV6 => L4Protocol::Tcp, VirtioNetHdrGsoProtocol::UDP | VirtioNetHdrGsoProtocol::UDP_L4 => L4Protocol::Udp, _ => L4Protocol::Unknown, }; // Extract GSO metadata when the kernel delivers a coalesced packet. - let gso_protocol = hdr.gso_type.protocol(); let (l3_protocol, gso_size, l2_len, l3_len, l4_len) = if hdr.gso_size > 0 && (gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 || gso_protocol == VirtioNetHdrGsoProtocol::TCPV6) @@ -525,7 +526,7 @@ fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { }; // csum_start = l2_len + l3_len; we assume standard Ethernet (14 bytes) // unless csum_start indicates otherwise. 
- let l2 = if hdr.csum_start > 14 { 14u8 } else { 0 }; + let l2 = if hdr.csum_start >= 14 { 14u8 } else { 0 }; let l3 = if l2 > 0 { hdr.csum_start - l2 as u16 } else { @@ -698,6 +699,7 @@ mod tests { assert_eq!(meta.l2_len, 14); assert_eq!(meta.l3_len, 40); assert_eq!(meta.l4_len, 20); + assert_eq!(meta.l4_protocol, L4Protocol::Tcp); } #[test] diff --git a/vm/devices/virtio/virtio_net/src/buffers.rs b/vm/devices/virtio/virtio_net/src/buffers.rs index c29bfb5913..d5403e5161 100644 --- a/vm/devices/virtio/virtio_net/src/buffers.rs +++ b/vm/devices/virtio/virtio_net/src/buffers.rs @@ -30,6 +30,10 @@ pub struct VirtioWorkPool { mem: GuestMemory, #[inspect(skip)] rx_packets: Vec>, + /// Whether the guest negotiated VIRTIO_NET_F_GUEST_TSO4. + guest_tso4: bool, + /// Whether the guest negotiated VIRTIO_NET_F_GUEST_TSO6. + guest_tso6: bool, } impl VirtioWorkPool { @@ -41,10 +45,12 @@ impl VirtioWorkPool { } /// Create a new instance. - pub fn new(mem: GuestMemory, queue_size: u16) -> Self { + pub fn new(mem: GuestMemory, queue_size: u16, guest_tso4: bool, guest_tso6: bool) -> Self { Self { mem, rx_packets: (0..queue_size).map(|_| None).collect(), + guest_tso4, + guest_tso6, } } @@ -173,9 +179,15 @@ impl BufferAccess for VirtioWorkPool { let data_valid = metadata.ip_checksum.is_valid() && metadata.l4_checksum.is_valid(); let flags = VirtioNetHeaderFlags::new().with_data_valid(data_valid); - // Build GSO fields when the backend indicates a large/coalesced packet. + // Build GSO fields when the backend indicates a large/coalesced packet + // and the guest has negotiated the corresponding GUEST_TSO feature. 
+ let gso_allowed = match metadata.l3_protocol { + L3Protocol::Ipv4 => self.guest_tso4, + L3Protocol::Ipv6 => self.guest_tso6, + L3Protocol::Unknown => false, + }; let (gso_type, gso_size, hdr_len, csum_start, csum_offset) = - if metadata.gso_size > 0 && metadata.l2_len > 0 && metadata.l3_len > 0 { + if metadata.gso_size > 0 && metadata.l2_len > 0 && metadata.l3_len > 0 && gso_allowed { let gso_protocol = match metadata.l3_protocol { L3Protocol::Ipv4 => VirtioNetHeaderGsoProtocol::TCPV4, L3Protocol::Ipv6 => VirtioNetHeaderGsoProtocol::TCPV6, diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index d333efd65f..183a9434be 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -507,10 +507,20 @@ struct ActiveState { } impl ActiveState { - fn new(mem: GuestMemory, rx_queue_size: u16, tx_queue_size: u16) -> Self { + fn new( + mem: GuestMemory, + rx_queue_size: u16, + tx_queue_size: u16, + negotiated_features: NetworkFeaturesBank0, + ) -> Self { Self { pending_tx_packets: (0..tx_queue_size).map(|_| None).collect(), - pending_rx_packets: VirtioWorkPool::new(mem, rx_queue_size), + pending_rx_packets: VirtioWorkPool::new( + mem, + rx_queue_size, + negotiated_features.guest_tso4(), + negotiated_features.guest_tso6(), + ), data: ProcessingData::new(rx_queue_size, tx_queue_size), stats: Default::default(), } @@ -637,6 +647,7 @@ impl Device { guest_memory.clone(), virtio_state.rx_queue_size, virtio_state.tx_queue_size, + negotiated_features, ); let worker = Worker { virtio_state, From 69d6fa78a4ce367e90d6994b6c54aa8f4b648534 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Fri, 1 May 2026 10:51:33 -0700 Subject: [PATCH 25/28] skip double computing the checksum --- vm/devices/net/net_consomme/consomme/src/tcp.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vm/devices/net/net_consomme/consomme/src/tcp.rs b/vm/devices/net/net_consomme/consomme/src/tcp.rs 
index a4eba3bc1e..a007f5fa2a 100644 --- a/vm/devices/net/net_consomme/consomme/src/tcp.rs +++ b/vm/devices/net/net_consomme/consomme/src/tcp.rs @@ -21,6 +21,7 @@ use pal_async::driver::Driver; use pal_async::interest::PollEvents; use pal_async::socket::PollReady; use pal_async::socket::PolledSocket; +use smoltcp::phy::Checksum; use smoltcp::phy::ChecksumCapabilities; use smoltcp::wire::ETHERNET_HEADER_LEN; use smoltcp::wire::EthernetFrame; @@ -523,13 +524,11 @@ impl Sender<'_, T> { let dst_ip_addr: IpAddress = self.ft.dst.ip().into(); let src_ip_addr: IpAddress = self.ft.src.ip().into(); let mut tcp_packet = TcpPacket::new_unchecked(tcp_payload_buf); - tcp.emit( - &mut tcp_packet, - &dst_ip_addr, - &src_ip_addr, - &ChecksumCapabilities::default(), - ); - + // Skip the TCP checksum during emit--fill_checksum below recomputes + // it after the payload has been copied in. + let mut caps = ChecksumCapabilities::default(); + caps.tcp = Checksum::None; + tcp.emit(&mut tcp_packet, &dst_ip_addr, &src_ip_addr, &caps); // Copy payload into TCP packet if let Some(payload) = &payload { payload.copy_to_slice(tcp_packet.payload_mut()); From dbb5a4c39b400093385527960dada0376cba8843 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Fri, 1 May 2026 11:38:53 -0700 Subject: [PATCH 26/28] build fix Co-authored-by: Copilot --- vm/devices/net/net_mana/src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vm/devices/net/net_mana/src/lib.rs b/vm/devices/net/net_mana/src/lib.rs index 4483ea19c7..7e38a070e8 100644 --- a/vm/devices/net/net_mana/src/lib.rs +++ b/vm/devices/net/net_mana/src/lib.rs @@ -44,6 +44,7 @@ use net_backend::BackendQueueStats; use net_backend::BufferAccess; use net_backend::Endpoint; use net_backend::EndpointAction; +use net_backend::L3Protocol; use net_backend::L4Protocol; use net_backend::MultiQueueSupport; use net_backend::Queue; @@ -974,6 +975,11 @@ impl Queue for ManaQueue { ip_checksum, l4_checksum, l4_protocol, + l3_protocol: L3Protocol::Unknown, + 
l2_len: 0, + l3_len: 0, + l4_len: 0, + gso_size: 0, }, ); if rx.bounced_len_with_padding > 0 { From ab47b5f5e99bbe5581581f444c6298e26677b280 Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Fri, 1 May 2026 14:50:45 -0700 Subject: [PATCH 27/28] review Co-authored-by: Copilot --- vm/devices/net/net_consomme/src/lib.rs | 11 +- vm/devices/net/net_tap/src/lib.rs | 115 ++++++++++++++++---- vm/devices/virtio/virtio_net/src/buffers.rs | 63 +++++++---- 3 files changed, 144 insertions(+), 45 deletions(-) diff --git a/vm/devices/net/net_consomme/src/lib.rs b/vm/devices/net/net_consomme/src/lib.rs index 0cf2a302c7..8e2d115b9d 100644 --- a/vm/devices/net/net_consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/src/lib.rs @@ -617,12 +617,19 @@ impl consomme::Client for Client<'_> { &RxMetadata { offset: 0, len: data.len(), - ip_checksum: if checksum.ipv4 { + ip_checksum: if checksum.tso.is_some() { + // TSO packets have partial/coalesced checksums; + // the guest must recompute per-segment checksums + // via NEEDS_CSUM. + RxChecksumState::Unknown + } else if checksum.ipv4 { RxChecksumState::Good } else { RxChecksumState::Unknown }, - l4_checksum: if checksum.tcp || checksum.udp { + l4_checksum: if checksum.tso.is_some() { + RxChecksumState::Unknown + } else if checksum.tcp || checksum.udp { RxChecksumState::Good } else { RxChecksumState::Unknown diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index fb5c101553..bd4248a03c 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -251,9 +251,23 @@ impl Queue for TapQueue { } let (hdr, _) = VirtioNetHdr::read_from_prefix(&self.buffer[..read_len]).unwrap(); - let rx_meta = parse_vnet_hdr(&hdr); let frame_start = size_of::(); let frame_len = read_len - size_of::(); + let frame = &self.buffer[frame_start..read_len]; + let rx_meta = parse_vnet_hdr(&hdr, frame); + + // With TUN_F_TSO4/6 the kernel may deliver GRO-coalesced + // frames larger than the guest RX buffer. 
Drop them + // gracefully instead of panicking in write_packet. + if frame_len > pool.capacity(rx) as usize { + tracing::warn!( + frame_len, + capacity = pool.capacity(rx), + "dropping rx packet: frame exceeds buffer capacity" + ); + continue; + } + pool.write_packet( rx, &RxMetadata { @@ -487,20 +501,55 @@ fn build_vnet_hdr(meta: &TxMetadata) -> VirtioNetHdr { } } +/// Parse the EtherType from the start of an Ethernet frame. +/// +/// Returns `(l2_len, is_ipv4, is_ipv6)`. Handles 802.1Q VLAN tags. +fn parse_ethertype(frame: &[u8]) -> (u8, bool, bool) { + const ETHERTYPE_IPV4: u16 = 0x0800; + const ETHERTYPE_IPV6: u16 = 0x86DD; + const ETHERTYPE_VLAN: u16 = 0x8100; + + if frame.len() < 14 { + return (0, false, false); + } + + let ethertype = u16::from_be_bytes([frame[12], frame[13]]); + if ethertype == ETHERTYPE_VLAN { + // VLAN-tagged: real EtherType is 4 bytes further. + if frame.len() < 18 { + return (0, false, false); + } + let inner = u16::from_be_bytes([frame[16], frame[17]]); + (18, inner == ETHERTYPE_IPV4, inner == ETHERTYPE_IPV6) + } else { + (14, ethertype == ETHERTYPE_IPV4, ethertype == ETHERTYPE_IPV6) + } +} + /// Parse a `VirtioNetHdr` from the TAP device into receive metadata. /// /// With `TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6` enabled, the kernel may /// deliver large coalesced packets with `NEEDS_CSUM` set and a non-NONE /// `gso_type`. We translate these into `RxMetadata` GSO fields so the /// virtio-net device can pass them to the guest as LRO packets. -fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { +/// +/// `frame` is the Ethernet frame bytes (after the vnet header) used to +/// parse the actual L2 header length, including VLAN tags. +fn parse_vnet_hdr(hdr: &VirtioNetHdr, frame: &[u8]) -> RxMetadata { let (ip_checksum, l4_checksum) = if hdr.flags.data_valid() { (RxChecksumState::Good, RxChecksumState::Good) + } else if hdr.flags.needs_csum() && hdr.gso_size > 0 { + // GSO + NEEDS_CSUM: the L4 checksum is partial (pseudo-header + // only). 
Report as Unknown so the virtio layer does not set + // DATA_VALID — it will set NEEDS_CSUM in the virtio header + // instead, and the guest will compute per-segment checksums. + (RxChecksumState::Unknown, RxChecksumState::Unknown) } else if hdr.flags.needs_csum() { - // NEEDS_CSUM means the data is valid but the L4 checksum in the - // header is incomplete (partial). For our purposes treat the - // checksums as good — the guest will be told via NEEDS_CSUM in - // the virtio header to complete them. + // Non-GSO + NEEDS_CSUM: the data integrity is fine but the L4 + // checksum field is partial. Since RxMetadata has no way to + // propagate NEEDS_CSUM for non-GSO packets, report as Good so + // the virtio header gets DATA_VALID and the guest accepts the + // packet. (RxChecksumState::Good, RxChecksumState::Good) } else { (RxChecksumState::Unknown, RxChecksumState::Unknown) @@ -514,6 +563,10 @@ fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { _ => L4Protocol::Unknown, }; + // Parse the actual Ethernet header to determine l2_len, handling + // VLAN tags. This mirrors the TX path's parse_ethertype logic. + let (parsed_l2_len, _, _) = parse_ethertype(frame); + // Extract GSO metadata when the kernel delivers a coalesced packet. let (l3_protocol, gso_size, l2_len, l3_len, l4_len) = if hdr.gso_size > 0 && (gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 @@ -524,10 +577,8 @@ fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { } else { L3Protocol::Ipv6 }; - // csum_start = l2_len + l3_len; we assume standard Ethernet (14 bytes) - // unless csum_start indicates otherwise. - let l2 = if hdr.csum_start >= 14 { 14u8 } else { 0 }; - let l3 = if l2 > 0 { + let l2 = parsed_l2_len; + let l3 = if l2 > 0 && hdr.csum_start >= l2 as u16 { hdr.csum_start - l2 as u16 } else { 0 @@ -634,6 +685,15 @@ mod tests { assert_eq!(hdr.gso_type.protocol(), VirtioNetHdrGsoProtocol::NONE); } + // Minimal 14-byte Ethernet header with IPv4 EtherType for use in tests. 
+ const ETH_IPV4: [u8; 14] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x00, + ]; + // Minimal 14-byte Ethernet header with IPv6 EtherType for use in tests. + const ETH_IPV6: [u8; 14] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x86, 0xDD, + ]; + #[test] fn rx_metadata_from_vnet_hdr_valid() { let hdr = VirtioNetHdr { @@ -641,27 +701,44 @@ mod tests { gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV4), ..Default::default() }; - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, &ETH_IPV4); assert_eq!(meta.ip_checksum, RxChecksumState::Good); assert_eq!(meta.l4_checksum, RxChecksumState::Good); assert_eq!(meta.l4_protocol, L4Protocol::Tcp); } #[test] - fn rx_metadata_from_vnet_hdr_needs_csum_treated_as_good() { - // With TUN_F_CSUM enabled, NEEDS_CSUM means data is valid but - // checksum is partial — treat as Good for our purposes. + fn rx_metadata_from_vnet_hdr_needs_csum_non_gso_treated_as_good() { + // Non-GSO + NEEDS_CSUM: data integrity is fine, L4 checksum is + // partial. Report as Good so DATA_VALID is set — RxMetadata has + // no way to propagate NEEDS_CSUM for non-GSO packets. let hdr = VirtioNetHdr { flags: VirtioNetHdrFlags::new().with_needs_csum(true), gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV6), ..Default::default() }; - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, &ETH_IPV6); assert_eq!(meta.ip_checksum, RxChecksumState::Good); assert_eq!(meta.l4_checksum, RxChecksumState::Good); assert_eq!(meta.l4_protocol, L4Protocol::Tcp); } + #[test] + fn rx_metadata_from_vnet_hdr_needs_csum_gso_treated_as_unknown() { + // GSO + NEEDS_CSUM: report as Unknown so DATA_VALID is not set. + // The virtio layer will set NEEDS_CSUM in the header instead.
+ let hdr = VirtioNetHdr { + flags: VirtioNetHdrFlags::new().with_needs_csum(true), + gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV6), + gso_size: 1440, + ..Default::default() + }; + let meta = parse_vnet_hdr(&hdr, &ETH_IPV6); + assert_eq!(meta.ip_checksum, RxChecksumState::Unknown); + assert_eq!(meta.l4_checksum, RxChecksumState::Unknown); + assert_eq!(meta.l4_protocol, L4Protocol::Tcp); + } + #[test] fn rx_metadata_from_vnet_hdr_gso_tcpv4() { let hdr = VirtioNetHdr { @@ -673,7 +750,7 @@ mod tests { csum_offset: 16, ..Default::default() }; - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, &ETH_IPV4); assert_eq!(meta.l3_protocol, L3Protocol::Ipv4); assert_eq!(meta.gso_size, 1460); assert_eq!(meta.l2_len, 14); @@ -693,7 +770,7 @@ mod tests { csum_offset: 16, ..Default::default() }; - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, &ETH_IPV6); assert_eq!(meta.l3_protocol, L3Protocol::Ipv6); assert_eq!(meta.gso_size, 1440); assert_eq!(meta.l2_len, 14); @@ -705,7 +782,7 @@ mod tests { #[test] fn rx_metadata_from_vnet_hdr_none() { let hdr = VirtioNetHdr::default(); - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, &[]); assert_eq!(meta.ip_checksum, RxChecksumState::Unknown); assert_eq!(meta.l4_checksum, RxChecksumState::Unknown); assert_eq!(meta.l4_protocol, L4Protocol::Unknown); @@ -718,7 +795,7 @@ mod tests { gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::UDP), ..Default::default() }; - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, &[]); assert_eq!(meta.l4_protocol, L4Protocol::Udp); } diff --git a/vm/devices/virtio/virtio_net/src/buffers.rs b/vm/devices/virtio/virtio_net/src/buffers.rs index d5403e5161..8d69478d32 100644 --- a/vm/devices/virtio/virtio_net/src/buffers.rs +++ b/vm/devices/virtio/virtio_net/src/buffers.rs @@ -186,34 +186,49 @@ impl BufferAccess for VirtioWorkPool { L3Protocol::Ipv6 => self.guest_tso6, L3Protocol::Unknown => false,
}; - let (gso_type, gso_size, hdr_len, csum_start, csum_offset) = - if metadata.gso_size > 0 && metadata.l2_len > 0 && metadata.l3_len > 0 && gso_allowed { - let gso_protocol = match metadata.l3_protocol { - L3Protocol::Ipv4 => VirtioNetHeaderGsoProtocol::TCPV4, - L3Protocol::Ipv6 => VirtioNetHeaderGsoProtocol::TCPV6, - L3Protocol::Unknown => VirtioNetHeaderGsoProtocol::NONE, - }; - let gso_type_byte: u8 = - VirtioNetHeaderGso::new().with_protocol(gso_protocol).into(); - let total_hdr = metadata.l2_len as u16 + metadata.l3_len + metadata.l4_len as u16; - let csum_start = metadata.l2_len as u16 + metadata.l3_len; - // TCP checksum offset within TCP header is 16. - let csum_offset: u16 = 16; - ( - gso_type_byte, - metadata.gso_size, - total_hdr, - csum_start, - csum_offset, - ) - } else { - (0, 0, 0, 0, 0) + let (gso_type, gso_size, hdr_len, csum_start, csum_offset) = if metadata.gso_size > 0 + && metadata.l2_len > 0 + && metadata.l3_len > 0 + && metadata.l4_len > 0 + && gso_allowed + { + let gso_protocol = match metadata.l3_protocol { + L3Protocol::Ipv4 => VirtioNetHeaderGsoProtocol::TCPV4, + L3Protocol::Ipv6 => VirtioNetHeaderGsoProtocol::TCPV6, + L3Protocol::Unknown => VirtioNetHeaderGsoProtocol::NONE, }; + let gso_type_byte: u8 = VirtioNetHeaderGso::new().with_protocol(gso_protocol).into(); + let total_hdr = metadata.l2_len as u16 + metadata.l3_len + metadata.l4_len as u16; + let csum_start = metadata.l2_len as u16 + metadata.l3_len; + // TCP checksum offset within TCP header is 16. 
+ let csum_offset: u16 = 16; + ( + gso_type_byte, + metadata.gso_size, + total_hdr, + csum_start, + csum_offset, + ) + } else { + if metadata.gso_size > 0 { + tracelimit::warn_ratelimited!( + gso_size = metadata.gso_size, + l2_len = metadata.l2_len, + l3_len = metadata.l3_len, + l4_len = metadata.l4_len, + ?gso_allowed, + "cannot emit GSO metadata: missing header lengths or guest feature" + ); + } + (0, 0, 0, 0, 0) + }; // When GSO is active, set NEEDS_CSUM so the guest computes - // per-segment checksums. + // per-segment checksums, and clear DATA_VALID to avoid the + // contradictory combination that could cause the guest to + // skip required per-segment checksum computation. let flags = if gso_size > 0 { - flags.with_needs_csum(true) + flags.with_needs_csum(true).with_data_valid(false) } else { flags }; From f0c517da93acee055ae3049f8a092dc457e0406c Mon Sep 17 00:00:00 2001 From: Daman Mulye Date: Fri, 1 May 2026 15:13:20 -0700 Subject: [PATCH 28/28] fmt --- vm/devices/net/net_tap/src/lib.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index bd4248a03c..8012532d5c 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -686,13 +686,9 @@ mod tests { } // Minimal 14-byte Ethernet header with IPv4 EtherType for use in tests. - const ETH_IPV4: [u8; 14] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x00, - ]; + const ETH_IPV4: [u8; 14] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x00]; // Minimal 14-byte Ethernet header with IPv6 EtherType for use in tests. - const ETH_IPV6: [u8; 14] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x86, 0xDD, - ]; + const ETH_IPV6: [u8; 14] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x86, 0xDD]; #[test] fn rx_metadata_from_vnet_hdr_valid() {