diff --git a/vm/devices/net/net_backend/src/lib.rs b/vm/devices/net/net_backend/src/lib.rs index fb838beb61..8e04d2e8ba 100644 --- a/vm/devices/net/net_backend/src/lib.rs +++ b/vm/devices/net/net_backend/src/lib.rs @@ -308,6 +308,17 @@ pub struct RxMetadata { pub l4_checksum: RxChecksumState, /// The L4 protocol. pub l4_protocol: L4Protocol, + /// The L3 protocol (IPv4/IPv6). Used for GSO/LRO metadata. + pub l3_protocol: L3Protocol, + /// L2 (Ethernet) header length in bytes (e.g. 14, or 18 with VLAN). + pub l2_len: u8, + /// L3 (IP) header length in bytes. + pub l3_len: u16, + /// L4 (TCP/UDP) header length in bytes. + pub l4_len: u8, + /// If non-zero, this is a GSO/LRO packet and this value is the MSS + /// (maximum segment size) that should be advertised to the guest. + pub gso_size: u16, } impl Default for RxMetadata { @@ -318,6 +329,11 @@ impl Default for RxMetadata { ip_checksum: RxChecksumState::Unknown, l4_checksum: RxChecksumState::Unknown, l4_protocol: L4Protocol::Unknown, + l3_protocol: L3Protocol::Unknown, + l2_len: 0, + l3_len: 0, + l4_len: 0, + gso_size: 0, } } } diff --git a/vm/devices/net/net_consomme/consomme/src/tcp.rs b/vm/devices/net/net_consomme/consomme/src/tcp.rs index 51fd341770..a007f5fa2a 100644 --- a/vm/devices/net/net_consomme/consomme/src/tcp.rs +++ b/vm/devices/net/net_consomme/consomme/src/tcp.rs @@ -21,6 +21,7 @@ use pal_async::driver::Driver; use pal_async::interest::PollEvents; use pal_async::socket::PollReady; use pal_async::socket::PolledSocket; +use smoltcp::phy::Checksum; use smoltcp::phy::ChecksumCapabilities; use smoltcp::wire::ETHERNET_HEADER_LEN; use smoltcp::wire::EthernetFrame; @@ -471,7 +472,19 @@ struct Sender<'a, T> { } impl Sender<'_, T> { - fn send_packet(&mut self, tcp: &TcpRepr<'_>, payload: Option>) { + /// Assemble and deliver a TCP packet to the client. 
+ /// + /// When `tso_mss` is `Some(mss)`, the payload is larger than a single + /// segment and the packet is delivered with [`ChecksumState::tso`] set so + /// that the downstream virtio-net device can present it to the guest as an + /// LRO/GSO packet. In this mode the TCP checksum is left as a + /// pseudo-header partial checksum (the guest completes it per-segment). + fn send_packet( + &mut self, + tcp: &TcpRepr<'_>, + payload: Option>, + tso_mss: Option, + ) { let buffer = &mut self.state.buffer; let mut eth_packet = EthernetFrame::new_unchecked(&mut buffer[..]); eth_packet.set_dst_addr(self.state.params.client_mac); @@ -511,22 +524,36 @@ impl Sender<'_, T> { let dst_ip_addr: IpAddress = self.ft.dst.ip().into(); let src_ip_addr: IpAddress = self.ft.src.ip().into(); let mut tcp_packet = TcpPacket::new_unchecked(tcp_payload_buf); - tcp.emit( - &mut tcp_packet, - &dst_ip_addr, - &src_ip_addr, - &ChecksumCapabilities::default(), - ); - + // Skip the TCP checksum during emit--fill_checksum below recomputes + // it after the payload has been copied in. + let mut caps = ChecksumCapabilities::default(); + caps.tcp = Checksum::None; + tcp.emit(&mut tcp_packet, &dst_ip_addr, &src_ip_addr, &caps); // Copy payload into TCP packet if let Some(payload) = &payload { payload.copy_to_slice(tcp_packet.payload_mut()); } - tcp_packet.fill_checksum(&self.ft.dst.ip().into(), &self.ft.src.ip().into()); + + if tso_mss.is_none() { + // Normal single-segment packet: compute the full checksum. + tcp_packet.fill_checksum(&self.ft.dst.ip().into(), &self.ft.src.ip().into()); + } + // For TSO packets the checksum field is left as emitted by + // smoltcp (zero / pseudo-header partial). The guest driver + // will compute per-segment checksums via NEEDS_CSUM. 
+ let n = ETHERNET_HEADER_LEN + ip_total_len; - let checksum_state = match self.ft.dst { - SocketAddr::V4(_) => ChecksumState::TCP4, - SocketAddr::V6(_) => ChecksumState::TCP6, + let checksum_state = match (self.ft.dst, tso_mss) { + (SocketAddr::V4(_), Some(mss)) => ChecksumState { + tso: Some(mss), + ..ChecksumState::TCP4 + }, + (SocketAddr::V6(_), Some(mss)) => ChecksumState { + tso: Some(mss), + ..ChecksumState::TCP6 + }, + (SocketAddr::V4(_), None) => ChecksumState::TCP4, + (SocketAddr::V6(_), None) => ChecksumState::TCP6, }; self.client.recv(&buffer[..n], &checksum_state); @@ -550,7 +577,7 @@ impl Sender<'_, T> { trace_tcp_packet(&tcp, 0, "rst xmit"); - self.send_packet(&tcp, None); + self.send_packet(&tcp, None, None); } } @@ -984,7 +1011,7 @@ impl TcpConnectionInner { payload: &[], }; - sender.send_packet(&tcp, None); + sender.send_packet(&tcp, None, None); self.tx_send += 1; } @@ -1023,7 +1050,9 @@ impl TcpConnectionInner { // exceeding: // 1. The available buffer length. // 2. The current window. - // 3. The configured maximum segment size. + // 3. The configured maximum segment size (only when the client + // buffer is not large enough for LRO — when it is, we emit one + // large frame and let the guest segment it). // 4. The client MTU. let tx_segment_end = { let ip_header_len = match sender.ft.dst { @@ -1032,11 +1061,21 @@ impl TcpConnectionInner { }; let header_len = ETHERNET_HEADER_LEN + ip_header_len + tcp.header_len(); let mtu = rx_mtu.min(sender.state.buffer.len()); + let max_payload = mtu - header_len; + // When the client buffer can hold more than one MSS of + // payload, skip the MSS cap and fill the whole buffer — + // the packet will be delivered as an LRO/TSO frame. + // Otherwise, apply the MSS limit for normal segmentation. 
+ let mss_limit = if max_payload > self.tx_mss { + tx_next + max_payload + } else { + tx_next + self.tx_mss + }; seq_min([ tx_payload_end, tx_window_end, - tx_next + self.tx_mss, - tx_next + (mtu - header_len), + mss_limit, + tx_next + max_payload, ]) }; @@ -1067,7 +1106,15 @@ impl TcpConnectionInner { .tx_buffer .view(payload_start..payload_start + payload_len); - sender.send_packet(&tcp, Some(payload)); + // When the payload exceeds a single MSS, deliver the frame as a + // TSO/LRO packet so the guest can re-segment it. + let tso_mss = if payload_len > self.tx_mss { + Some(self.tx_mss.min(u16::MAX as usize) as u16) + } else { + None + }; + + sender.send_packet(&tcp, Some(payload), tso_mss); self.tx_send = tx_next; self.needs_ack = false; } @@ -1118,7 +1165,7 @@ impl TcpConnectionInner { trace_tcp_packet(&tcp, 0, "ack"); - sender.send_packet(&tcp, None); + sender.send_packet(&tcp, None, None); } fn handle_listen_syn( diff --git a/vm/devices/net/net_consomme/src/lib.rs b/vm/devices/net/net_consomme/src/lib.rs index 2dc2294246..8e2d115b9d 100644 --- a/vm/devices/net/net_consomme/src/lib.rs +++ b/vm/devices/net/net_consomme/src/lib.rs @@ -17,6 +17,7 @@ use mesh::rpc::Rpc; use mesh::rpc::RpcError; use mesh::rpc::RpcSend; use net_backend::BufferAccess; +use net_backend::L3Protocol; use net_backend::L4Protocol; use net_backend::QueueConfig; use net_backend::RssConfig; @@ -596,28 +597,49 @@ impl consomme::Client for Client<'_> { }; let max = self.pool.capacity(rx_id) as usize; if data.len() <= max { + let l4_protocol = if checksum.tcp { + L4Protocol::Tcp + } else if checksum.udp { + L4Protocol::Udp + } else { + L4Protocol::Unknown + }; + + // Determine L3 protocol and header lengths for GSO metadata. + // Parse the Ethernet header to find IP version, then derive + // l2_len and l3_len from the packet. 
+ let (l3_protocol, l2_len, l3_len, l4_len) = parse_rx_header_lengths(data, checksum); + + let gso_size = checksum.tso.unwrap_or(0); + self.pool.write_packet( rx_id, &RxMetadata { offset: 0, len: data.len(), - ip_checksum: if checksum.ipv4 { + ip_checksum: if checksum.tso.is_some() { + // TSO packets have partial/coalesced checksums; + // the guest must recompute per-segment checksums + // via NEEDS_CSUM. + RxChecksumState::Unknown + } else if checksum.ipv4 { RxChecksumState::Good } else { RxChecksumState::Unknown }, - l4_checksum: if checksum.tcp || checksum.udp { + l4_checksum: if checksum.tso.is_some() { + RxChecksumState::Unknown + } else if checksum.tcp || checksum.udp { RxChecksumState::Good } else { RxChecksumState::Unknown }, - l4_protocol: if checksum.tcp { - L4Protocol::Tcp - } else if checksum.udp { - L4Protocol::Udp - } else { - L4Protocol::Unknown - }, + l4_protocol, + l3_protocol, + gso_size, + l2_len, + l3_len, + l4_len, }, data, ); @@ -636,3 +658,50 @@ impl consomme::Client for Client<'_> { } } } + +/// Parse an Ethernet frame to extract L3 protocol, l2_len, l3_len, and l4_len. +/// +/// Used to populate `RxMetadata` GSO fields on the receive path so that +/// the virtio-net device can construct proper virtio headers for LRO packets. 
+fn parse_rx_header_lengths(data: &[u8], checksum: &ChecksumState) -> (L3Protocol, u8, u16, u8) { + const ETHERTYPE_IPV4: u16 = 0x0800; + const ETHERTYPE_IPV6: u16 = 0x86DD; + + if data.len() < 14 { + return (L3Protocol::Unknown, 0, 0, 0); + } + + let ethertype = u16::from_be_bytes([data[12], data[13]]); + let l2_len: u8 = 14; + + match ethertype { + ETHERTYPE_IPV4 if checksum.ipv4 && data.len() >= l2_len as usize + 20 => { + let ihl = (data[l2_len as usize] & 0x0f) as u16 * 4; + let l3_len = ihl.max(20); + let l4_start = l2_len as usize + l3_len as usize; + // Derive TCP header length from data offset field if TCP + let l4_len = if checksum.tcp && data.len() >= l4_start + 20 { + let data_offset = (data[l4_start + 12] >> 4) * 4; + data_offset.max(20) + } else { + 0 + }; + (L3Protocol::Ipv4, l2_len, l3_len, l4_len) + } + ETHERTYPE_IPV6 if data.len() >= l2_len as usize + 40 => { + // Base IPv6 header only. Extension headers are not parsed, but + // this is safe because consomme never generates IPv6 extension + // headers on the receive path. 
+ let l3_len: u16 = 40; + let l4_start = l2_len as usize + l3_len as usize; + let l4_len = if checksum.tcp && data.len() >= l4_start + 20 { + let data_offset = (data[l4_start + 12] >> 4) * 4; + data_offset.max(20) + } else { + 0 + }; + (L3Protocol::Ipv6, l2_len, l3_len, l4_len) + } + _ => (L3Protocol::Unknown, 0, 0, 0), + } +} diff --git a/vm/devices/net/net_mana/src/lib.rs b/vm/devices/net/net_mana/src/lib.rs index 4483ea19c7..7e38a070e8 100644 --- a/vm/devices/net/net_mana/src/lib.rs +++ b/vm/devices/net/net_mana/src/lib.rs @@ -44,6 +44,7 @@ use net_backend::BackendQueueStats; use net_backend::BufferAccess; use net_backend::Endpoint; use net_backend::EndpointAction; +use net_backend::L3Protocol; use net_backend::L4Protocol; use net_backend::MultiQueueSupport; use net_backend::Queue; @@ -974,6 +975,11 @@ impl Queue for ManaQueue { ip_checksum, l4_checksum, l4_protocol, + l3_protocol: L3Protocol::Unknown, + l2_len: 0, + l3_len: 0, + l4_len: 0, + gso_size: 0, }, ); if rx.bounced_len_with_padding > 0 { diff --git a/vm/devices/net/net_tap/src/lib.rs b/vm/devices/net/net_tap/src/lib.rs index da9bcb69c6..8012532d5c 100644 --- a/vm/devices/net/net_tap/src/lib.rs +++ b/vm/devices/net/net_tap/src/lib.rs @@ -14,6 +14,7 @@ use futures::io::AsyncRead; use inspect::InspectMut; use net_backend::BufferAccess; use net_backend::Endpoint; +use net_backend::L3Protocol; use net_backend::L4Protocol; use net_backend::Queue; use net_backend::QueueConfig; @@ -116,25 +117,20 @@ pub struct TapEndpoint { impl TapEndpoint { pub fn new(tap: tap::Tap) -> Result { - // Do not enable any RX offloads (TUN_F_CSUM, TUN_F_TSO*, etc.). + // Enable RX offloads so the kernel can deliver large coalesced + // (GRO/LRO) TCP packets instead of segmenting them. This reduces + // per-packet overhead and improves throughput when the guest has + // negotiated VIRTIO_NET_F_GUEST_TSO4/6. 
// - // The TUN_F_* flags are the TAP equivalent of VIRTIO_NET_F_GUEST_*: - // they tell the kernel that our reader can handle partial checksums - // (NEEDS_CSUM) and unsegmented GSO packets. Since net_backend's - // RxMetadata has no way to represent "checksum needs to be completed" - // (only Good/Bad/Unknown), and no concept of receive-side GRO/RSC, - // accepting such packets would force us to either lie about checksum - // state or complete checksums in software. + // TUN_F_CSUM (1) — we can handle NEEDS_CSUM (partial checksum) + // TUN_F_TSO4 (2) — we can handle TSOv4 (large IPv4/TCP packets) + // TUN_F_TSO6 (4) — we can handle TSOv6 (large IPv6/TCP packets) // - // With offloads set to 0, the kernel completes all checksums and - // segments all GSO packets before delivering them to us. This is - // correct and simple. The TX path is unaffected — writes with - // NEEDS_CSUM and GSO types in the vnet header are processed by the - // kernel regardless of these flags. - // - // We explicitly set 0 rather than skipping the call, in case a - // previous user of this TAP fd set offloads to a non-zero value. - tap.set_offloads(0)?; + // TUN_F_CSUM is required for TUN_F_TSO4/6. + const TUN_F_CSUM: u32 = 1; + const TUN_F_TSO4: u32 = 2; + const TUN_F_TSO6: u32 = 4; + tap.set_offloads(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6)?; Ok(Self { tap: Arc::new(Mutex::new(Some(tap))), @@ -255,9 +251,23 @@ impl Queue for TapQueue { } let (hdr, _) = VirtioNetHdr::read_from_prefix(&self.buffer[..read_len]).unwrap(); - let rx_meta = parse_vnet_hdr(&hdr); let frame_start = size_of::<VirtioNetHdr>(); let frame_len = read_len - size_of::<VirtioNetHdr>(); + let frame = &self.buffer[frame_start..read_len]; + let rx_meta = parse_vnet_hdr(&hdr, frame); + + // With TUN_F_TSO4/6 the kernel may deliver GRO-coalesced + // frames larger than the guest RX buffer. Drop them + // gracefully instead of panicking in write_packet. 
+ if frame_len > pool.capacity(rx) as usize { + tracing::warn!( + frame_len, + capacity = pool.capacity(rx), + "dropping rx packet: frame exceeds buffer capacity" + ); + continue; + } + pool.write_packet( rx, &RxMetadata { @@ -491,35 +501,110 @@ fn build_vnet_hdr(meta: &TxMetadata) -> VirtioNetHdr { } } +/// Parse the EtherType from the start of an Ethernet frame. +/// +/// Returns `(l2_len, is_ipv4, is_ipv6)`. Handles 802.1Q VLAN tags. +fn parse_ethertype(frame: &[u8]) -> (u8, bool, bool) { + const ETHERTYPE_IPV4: u16 = 0x0800; + const ETHERTYPE_IPV6: u16 = 0x86DD; + const ETHERTYPE_VLAN: u16 = 0x8100; + + if frame.len() < 14 { + return (0, false, false); + } + + let ethertype = u16::from_be_bytes([frame[12], frame[13]]); + if ethertype == ETHERTYPE_VLAN { + // VLAN-tagged: real EtherType is 4 bytes further. + if frame.len() < 18 { + return (0, false, false); + } + let inner = u16::from_be_bytes([frame[16], frame[17]]); + (18, inner == ETHERTYPE_IPV4, inner == ETHERTYPE_IPV6) + } else { + (14, ethertype == ETHERTYPE_IPV4, ethertype == ETHERTYPE_IPV6) + } +} + /// Parse a `VirtioNetHdr` from the TAP device into receive metadata. /// -/// Because we do not set any `TUN_F_*` RX offload flags (see -/// [`TapEndpoint::new`]), the kernel will never send us `NEEDS_CSUM` or GSO -/// packets. We only need to handle `DATA_VALID` (checksum verified by the -/// kernel) and the default case (no information). +/// With `TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6` enabled, the kernel may +/// deliver large coalesced packets with `NEEDS_CSUM` set and a non-NONE +/// `gso_type`. We translate these into `RxMetadata` GSO fields so the +/// virtio-net device can pass them to the guest as LRO packets. /// -/// The `gso_type` field should always be `GSO_NONE` since we didn't enable -/// receive-side GSO, but we still parse it defensively to extract L4 protocol -/// information if present. 
-fn parse_vnet_hdr(hdr: &VirtioNetHdr) -> RxMetadata { +/// `frame` is the Ethernet frame bytes (after the vnet header) used to +/// parse the actual L2 header length, including VLAN tags. +fn parse_vnet_hdr(hdr: &VirtioNetHdr, frame: &[u8]) -> RxMetadata { let (ip_checksum, l4_checksum) = if hdr.flags.data_valid() { (RxChecksumState::Good, RxChecksumState::Good) + } else if hdr.flags.needs_csum() && hdr.gso_size > 0 { + // GSO + NEEDS_CSUM: the L4 checksum is partial (pseudo-header + // only). Report as Unknown so the virtio layer does not set + // DATA_VALID — it will set NEEDS_CSUM in the virtio header + // instead, and the guest will compute per-segment checksums. + (RxChecksumState::Unknown, RxChecksumState::Unknown) + } else if hdr.flags.needs_csum() { + // Non-GSO + NEEDS_CSUM: the data integrity is fine but the L4 + // checksum field is partial. Since RxMetadata has no way to + // propagate NEEDS_CSUM for non-GSO packets, report as Good so + // the virtio header gets DATA_VALID and the guest accepts the + // packet. + (RxChecksumState::Good, RxChecksumState::Good) } else { (RxChecksumState::Unknown, RxChecksumState::Unknown) }; - let l4_protocol = match hdr.gso_type.protocol() { + let gso_protocol = hdr.gso_type.protocol(); + + let l4_protocol = match gso_protocol { VirtioNetHdrGsoProtocol::TCPV4 | VirtioNetHdrGsoProtocol::TCPV6 => L4Protocol::Tcp, VirtioNetHdrGsoProtocol::UDP | VirtioNetHdrGsoProtocol::UDP_L4 => L4Protocol::Udp, _ => L4Protocol::Unknown, }; + // Parse the actual Ethernet header to determine l2_len, handling + // VLAN tags. This mirrors the TX path's parse_ethertype logic. + let (parsed_l2_len, _, _) = parse_ethertype(frame); + + // Extract GSO metadata when the kernel delivers a coalesced packet. 
+ let (l3_protocol, gso_size, l2_len, l3_len, l4_len) = if hdr.gso_size > 0 + && (gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 + || gso_protocol == VirtioNetHdrGsoProtocol::TCPV6) + { + let l3_proto = if gso_protocol == VirtioNetHdrGsoProtocol::TCPV4 { + L3Protocol::Ipv4 + } else { + L3Protocol::Ipv6 + }; + let l2 = parsed_l2_len; + let l3 = if l2 > 0 && hdr.csum_start >= l2 as u16 { + hdr.csum_start - l2 as u16 + } else { + 0 + }; + let l4 = if hdr.hdr_len > hdr.csum_start { + let v = hdr.hdr_len - hdr.csum_start; + if v <= u8::MAX as u16 { v as u8 } else { 0 } + } else { + 0 + }; + (l3_proto, hdr.gso_size, l2, l3, l4) + } else { + (L3Protocol::Unknown, 0, 0, 0, 0) + }; + RxMetadata { offset: 0, len: 0, ip_checksum, l4_checksum, l4_protocol, + l3_protocol, + gso_size, + l2_len, + l3_len, + l4_len, } } @@ -600,6 +685,11 @@ mod tests { assert_eq!(hdr.gso_type.protocol(), VirtioNetHdrGsoProtocol::NONE); } + // Minimal 14-byte Ethernet header with IPv4 EtherType for use in tests. + const ETH_IPV4: [u8; 14] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x00]; + // Minimal 14-byte Ethernet header with IPv6 EtherType for use in tests. + const ETH_IPV6: [u8; 14] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x86, 0xDD]; + #[test] fn rx_metadata_from_vnet_hdr_valid() { let hdr = VirtioNetHdr { @@ -607,31 +697,88 @@ mod tests { gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV4), ..Default::default() }; - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, Ð_IPV4); assert_eq!(meta.ip_checksum, RxChecksumState::Good); assert_eq!(meta.l4_checksum, RxChecksumState::Good); assert_eq!(meta.l4_protocol, L4Protocol::Tcp); } #[test] - fn rx_metadata_from_vnet_hdr_needs_csum_treated_as_unknown() { - // We don't set TUN_F_CSUM so the kernel should never send NEEDS_CSUM, - // but if it did, we conservatively treat it as Unknown (not Good). 
+ fn rx_metadata_from_vnet_hdr_needs_csum_non_gso_treated_as_good() { + // Non-GSO + NEEDS_CSUM: data integrity is fine, L4 checksum is + // partial. Report as Good so DATA_VALID is set — RxMetadata has + // no way to propagate NEEDS_CSUM for non-GSO packets. let hdr = VirtioNetHdr { flags: VirtioNetHdrFlags::new().with_needs_csum(true), gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV6), ..Default::default() }; - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, &ETH_IPV6); + assert_eq!(meta.ip_checksum, RxChecksumState::Good); + assert_eq!(meta.l4_checksum, RxChecksumState::Good); + assert_eq!(meta.l4_protocol, L4Protocol::Tcp); + } + + #[test] + fn rx_metadata_from_vnet_hdr_needs_csum_gso_treated_as_unknown() { + // GSO + NEEDS_CSUM: report as Unknown so DATA_VALID is not set. + // The virtio layer will set NEEDS_CSUM in the header instead. + let hdr = VirtioNetHdr { + flags: VirtioNetHdrFlags::new().with_needs_csum(true), + gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV6), + gso_size: 1440, + ..Default::default() + }; + let meta = parse_vnet_hdr(&hdr, &ETH_IPV6); assert_eq!(meta.ip_checksum, RxChecksumState::Unknown); assert_eq!(meta.l4_checksum, RxChecksumState::Unknown); assert_eq!(meta.l4_protocol, L4Protocol::Tcp); } + #[test] + fn rx_metadata_from_vnet_hdr_gso_tcpv4() { + let hdr = VirtioNetHdr { + flags: VirtioNetHdrFlags::new().with_needs_csum(true), + gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV4), + hdr_len: 14 + 20 + 32, // eth + ipv4 + tcp w/options + gso_size: 1460, + csum_start: 14 + 20, + csum_offset: 16, + ..Default::default() + }; + let meta = parse_vnet_hdr(&hdr, &ETH_IPV4); + assert_eq!(meta.l3_protocol, L3Protocol::Ipv4); + assert_eq!(meta.gso_size, 1460); + assert_eq!(meta.l2_len, 14); + assert_eq!(meta.l3_len, 20); + assert_eq!(meta.l4_len, 32); + assert_eq!(meta.l4_protocol, L4Protocol::Tcp); + } + + #[test] + fn 
rx_metadata_from_vnet_hdr_gso_tcpv6() { + let hdr = VirtioNetHdr { + flags: VirtioNetHdrFlags::new().with_needs_csum(true), + gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::TCPV6), + hdr_len: 14 + 40 + 20, + gso_size: 1440, + csum_start: 14 + 40, + csum_offset: 16, + ..Default::default() + }; + let meta = parse_vnet_hdr(&hdr, &ETH_IPV6); + assert_eq!(meta.l3_protocol, L3Protocol::Ipv6); + assert_eq!(meta.gso_size, 1440); + assert_eq!(meta.l2_len, 14); + assert_eq!(meta.l3_len, 40); + assert_eq!(meta.l4_len, 20); + assert_eq!(meta.l4_protocol, L4Protocol::Tcp); + } + #[test] fn rx_metadata_from_vnet_hdr_none() { let hdr = VirtioNetHdr::default(); - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, &[]); assert_eq!(meta.ip_checksum, RxChecksumState::Unknown); assert_eq!(meta.l4_checksum, RxChecksumState::Unknown); assert_eq!(meta.l4_protocol, L4Protocol::Unknown); @@ -644,7 +791,7 @@ mod tests { gso_type: VirtioNetHdrGso::new().with_protocol(VirtioNetHdrGsoProtocol::UDP), ..Default::default() }; - let meta = parse_vnet_hdr(&hdr); + let meta = parse_vnet_hdr(&hdr, &[]); assert_eq!(meta.l4_protocol, L4Protocol::Udp); } diff --git a/vm/devices/virtio/virtio_net/src/buffers.rs b/vm/devices/virtio/virtio_net/src/buffers.rs index 56e475461d..8d69478d32 100644 --- a/vm/devices/virtio/virtio_net/src/buffers.rs +++ b/vm/devices/virtio/virtio_net/src/buffers.rs @@ -3,10 +3,13 @@ use crate::VirtioNetHeader; use crate::VirtioNetHeaderFlags; +use crate::VirtioNetHeaderGso; +use crate::VirtioNetHeaderGsoProtocol; use crate::header_size; use guestmem::GuestMemory; use inspect::Inspect; use net_backend::BufferAccess; +use net_backend::L3Protocol; use net_backend::RxBufferSegment; use net_backend::RxId; use net_backend::RxMetadata; @@ -27,6 +30,10 @@ pub struct VirtioWorkPool { mem: GuestMemory, #[inspect(skip)] rx_packets: Vec>, + /// Whether the guest negotiated VIRTIO_NET_F_GUEST_TSO4. 
+ guest_tso4: bool, + /// Whether the guest negotiated VIRTIO_NET_F_GUEST_TSO6. + guest_tso6: bool, } impl VirtioWorkPool { @@ -38,10 +45,12 @@ impl VirtioWorkPool { } /// Create a new instance. - pub fn new(mem: GuestMemory, queue_size: u16) -> Self { + pub fn new(mem: GuestMemory, queue_size: u16, guest_tso4: bool, guest_tso6: bool) -> Self { Self { mem, rx_packets: (0..queue_size).map(|_| None).collect(), + guest_tso4, + guest_tso6, } } @@ -170,8 +179,67 @@ impl BufferAccess for VirtioWorkPool { let data_valid = metadata.ip_checksum.is_valid() && metadata.l4_checksum.is_valid(); let flags = VirtioNetHeaderFlags::new().with_data_valid(data_valid); + // Build GSO fields when the backend indicates a large/coalesced packet + // and the guest has negotiated the corresponding GUEST_TSO feature. + let gso_allowed = match metadata.l3_protocol { + L3Protocol::Ipv4 => self.guest_tso4, + L3Protocol::Ipv6 => self.guest_tso6, + L3Protocol::Unknown => false, + }; + let (gso_type, gso_size, hdr_len, csum_start, csum_offset) = if metadata.gso_size > 0 + && metadata.l2_len > 0 + && metadata.l3_len > 0 + && metadata.l4_len > 0 + && gso_allowed + { + let gso_protocol = match metadata.l3_protocol { + L3Protocol::Ipv4 => VirtioNetHeaderGsoProtocol::TCPV4, + L3Protocol::Ipv6 => VirtioNetHeaderGsoProtocol::TCPV6, + L3Protocol::Unknown => VirtioNetHeaderGsoProtocol::NONE, + }; + let gso_type_byte: u8 = VirtioNetHeaderGso::new().with_protocol(gso_protocol).into(); + let total_hdr = metadata.l2_len as u16 + metadata.l3_len + metadata.l4_len as u16; + let csum_start = metadata.l2_len as u16 + metadata.l3_len; + // TCP checksum offset within TCP header is 16. 
+ let csum_offset: u16 = 16; + ( + gso_type_byte, + metadata.gso_size, + total_hdr, + csum_start, + csum_offset, + ) + } else { + if metadata.gso_size > 0 { + tracelimit::warn_ratelimited!( + gso_size = metadata.gso_size, + l2_len = metadata.l2_len, + l3_len = metadata.l3_len, + l4_len = metadata.l4_len, + ?gso_allowed, + "cannot emit GSO metadata: missing header lengths or guest feature" + ); + } + (0, 0, 0, 0, 0) + }; + + // When GSO is active, set NEEDS_CSUM so the guest computes + // per-segment checksums, and clear DATA_VALID to avoid the + // contradictory combination that could cause the guest to + // skip required per-segment checksum computation. + let flags = if gso_size > 0 { + flags.with_needs_csum(true).with_data_valid(false) + } else { + flags + }; + let virtio_net_header = VirtioNetHeader { flags: flags.into(), + gso_type, + gso_size, + hdr_len, + csum_start, + csum_offset, num_buffers: 1, ..FromZeros::new_zeroed() }; diff --git a/vm/devices/virtio/virtio_net/src/lib.rs b/vm/devices/virtio/virtio_net/src/lib.rs index cb5227f149..183a9434be 100644 --- a/vm/devices/virtio/virtio_net/src/lib.rs +++ b/vm/devices/virtio/virtio_net/src/lib.rs @@ -268,6 +268,8 @@ impl VirtioDevice for Device { .with_mac(true) .with_csum(csum) .with_guest_csum(true) + .with_guest_tso4(true) + .with_guest_tso6(true) .with_host_tso4(host_tso) .with_host_tso6(host_tso); @@ -505,10 +507,20 @@ struct ActiveState { } impl ActiveState { - fn new(mem: GuestMemory, rx_queue_size: u16, tx_queue_size: u16) -> Self { + fn new( + mem: GuestMemory, + rx_queue_size: u16, + tx_queue_size: u16, + negotiated_features: NetworkFeaturesBank0, + ) -> Self { Self { pending_tx_packets: (0..tx_queue_size).map(|_| None).collect(), - pending_rx_packets: VirtioWorkPool::new(mem, rx_queue_size), + pending_rx_packets: VirtioWorkPool::new( + mem, + rx_queue_size, + negotiated_features.guest_tso4(), + negotiated_features.guest_tso6(), + ), data: ProcessingData::new(rx_queue_size, tx_queue_size), stats: 
Default::default(), } @@ -635,6 +647,7 @@ impl Device { guest_memory.clone(), virtio_state.rx_queue_size, virtio_state.tx_queue_size, + negotiated_features, ); let worker = Worker { virtio_state,